summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorAndré Fabian Silva Delgado <emulatorman@parabola.nu>2016-10-20 00:10:27 -0300
committerAndré Fabian Silva Delgado <emulatorman@parabola.nu>2016-10-20 00:10:27 -0300
commitd0b2f91bede3bd5e3d24dd6803e56eee959c1797 (patch)
tree7fee4ab0509879c373c4f2cbd5b8a5be5b4041ee /kernel
parente914f8eb445e8f74b00303c19c2ffceaedd16a05 (diff)
Linux-libre 4.8.2-gnupck-4.8.2-gnu
Diffstat (limited to 'kernel')
-rw-r--r--kernel/audit.c4
-rw-r--r--kernel/audit.h2
-rw-r--r--kernel/auditfilter.c147
-rw-r--r--kernel/auditsc.c10
-rw-r--r--kernel/bpf/arraymap.c163
-rw-r--r--kernel/bpf/core.c9
-rw-r--r--kernel/bpf/hashtab.c84
-rw-r--r--kernel/bpf/helpers.c2
-rw-r--r--kernel/bpf/inode.c4
-rw-r--r--kernel/bpf/stackmap.c2
-rw-r--r--kernel/bpf/syscall.c66
-rw-r--r--kernel/bpf/verifier.c26
-rw-r--r--kernel/cgroup.c77
-rw-r--r--kernel/cgroup_pids.c34
-rw-r--r--kernel/configs/android-base.config152
-rw-r--r--kernel/configs/android-recommended.config121
-rw-r--r--kernel/cpu.c66
-rw-r--r--kernel/cpuset.c28
-rw-r--r--kernel/events/callchain.c14
-rw-r--r--kernel/events/core.c437
-rw-r--r--kernel/events/internal.h25
-rw-r--r--kernel/events/ring_buffer.c15
-rw-r--r--kernel/exit.c86
-rw-r--r--kernel/fork.c40
-rw-r--r--kernel/freezer.c2
-rw-r--r--kernel/futex.c23
-rw-r--r--kernel/irq/Makefile1
-rw-r--r--kernel/irq/affinity.c63
-rw-r--r--kernel/irq/chip.c98
-rw-r--r--kernel/irq/handle.c18
-rw-r--r--kernel/irq/internals.h4
-rw-r--r--kernel/irq/ipi.c4
-rw-r--r--kernel/irq/irqdesc.c63
-rw-r--r--kernel/irq/irqdomain.c94
-rw-r--r--kernel/irq/manage.c77
-rw-r--r--kernel/irq/msi.c3
-rw-r--r--kernel/irq/proc.c11
-rw-r--r--kernel/jump_label.c63
-rw-r--r--kernel/kexec.c3
-rw-r--r--kernel/kexec_core.c69
-rw-r--r--kernel/ksysfs.c6
-rw-r--r--kernel/kthread.c2
-rw-r--r--kernel/livepatch/core.c2
-rw-r--r--kernel/locking/lockdep.c13
-rw-r--r--kernel/locking/mutex-debug.h4
-rw-r--r--kernel/locking/mutex.h10
-rw-r--r--kernel/locking/qrwlock.c2
-rw-r--r--kernel/locking/qspinlock.c88
-rw-r--r--kernel/locking/qspinlock_paravirt.h6
-rw-r--r--kernel/locking/qspinlock_stat.h1
-rw-r--r--kernel/locking/rtmutex.c2
-rw-r--r--kernel/locking/rwsem-xadd.c194
-rw-r--r--kernel/locking/rwsem.c8
-rw-r--r--kernel/locking/rwsem.h52
-rw-r--r--kernel/memremap.c14
-rw-r--r--kernel/module.c109
-rw-r--r--kernel/panic.c13
-rw-r--r--kernel/power/Kconfig278
-rw-r--r--kernel/power/Makefile32
-rw-r--r--kernel/power/console.c8
-rw-r--r--kernel/power/hibernate.c141
-rw-r--r--kernel/power/main.c11
-rw-r--r--kernel/power/power.h48
-rw-r--r--kernel/power/process.c3
-rw-r--r--kernel/power/qos.c11
-rw-r--r--kernel/power/snapshot.c1249
-rw-r--r--kernel/power/suspend.c10
-rw-r--r--kernel/power/swap.c39
-rw-r--r--kernel/power/user.c14
-rw-r--r--kernel/printk/braille.c4
-rw-r--r--kernel/printk/nmi.c38
-rw-r--r--kernel/printk/printk.c190
-rw-r--r--kernel/profile.c181
-rw-r--r--kernel/ptrace.c4
-rw-r--r--kernel/rcu/rcuperf.c25
-rw-r--r--kernel/rcu/rcutorture.c9
-rw-r--r--kernel/rcu/tree.c691
-rw-r--r--kernel/rcu/tree.h15
-rw-r--r--kernel/rcu/tree_exp.h655
-rw-r--r--kernel/rcu/tree_plugin.h95
-rw-r--r--kernel/rcu/update.c7
-rw-r--r--kernel/relay.c34
-rw-r--r--kernel/sched/bfs.c500
-rw-r--r--kernel/sched/bfs_sched.h2
-rw-r--r--kernel/sched/core.c141
-rw-r--r--kernel/sched/cpuacct.c114
-rw-r--r--kernel/sched/cpudeadline.c2
-rw-r--r--kernel/sched/cpufreq_schedutil.c74
-rw-r--r--kernel/sched/cputime.c203
-rw-r--r--kernel/sched/deadline.c5
-rw-r--r--kernel/sched/debug.c2
-rw-r--r--kernel/sched/fair.c251
-rw-r--r--kernel/sched/idle.c4
-rw-r--r--kernel/sched/sched.h23
-rw-r--r--kernel/seccomp.c154
-rw-r--r--kernel/signal.c24
-rw-r--r--kernel/smp.c81
-rw-r--r--kernel/smpboot.c2
-rw-r--r--kernel/stop_machine.c8
-rw-r--r--kernel/sysctl.c67
-rw-r--r--kernel/task_work.c11
-rw-r--r--kernel/time/alarmtimer.c1
-rw-r--r--kernel/time/clockevents.c2
-rw-r--r--kernel/time/clocksource.c8
-rw-r--r--kernel/time/hrtimer.c42
-rw-r--r--kernel/time/test_udelay.c16
-rw-r--r--kernel/time/tick-broadcast-hrtimer.c1
-rw-r--r--kernel/time/tick-internal.h1
-rw-r--r--kernel/time/tick-sched.c95
-rw-r--r--kernel/time/timeconv.c11
-rw-r--r--kernel/time/timekeeping.c18
-rw-r--r--kernel/time/timer.c1133
-rw-r--r--kernel/time/timer_stats.c6
-rw-r--r--kernel/torture.c176
-rw-r--r--kernel/trace/Kconfig1
-rw-r--r--kernel/trace/blktrace.c83
-rw-r--r--kernel/trace/bpf_trace.c164
-rw-r--r--kernel/trace/ftrace.c313
-rw-r--r--kernel/trace/trace.c358
-rw-r--r--kernel/trace/trace.h48
-rw-r--r--kernel/trace/trace_entries.h4
-rw-r--r--kernel/trace/trace_events.c219
-rw-r--r--kernel/trace/trace_events_hist.c14
-rw-r--r--kernel/trace/trace_functions.c2
-rw-r--r--kernel/trace/trace_functions_graph.c19
-rw-r--r--kernel/trace/trace_kprobe.c1
-rw-r--r--kernel/trace/trace_mmiotrace.c10
-rw-r--r--kernel/trace/trace_probe.c33
-rw-r--r--kernel/trace/trace_probe.h10
-rw-r--r--kernel/user_namespace.c14
-rw-r--r--kernel/workqueue.c112
131 files changed, 6497 insertions, 4614 deletions
diff --git a/kernel/audit.c b/kernel/audit.c
index 8d528f993..a8a91bd2b 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -932,7 +932,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
if (!audit_enabled && msg_type != AUDIT_USER_AVC)
return 0;
- err = audit_filter_user(msg_type);
+ err = audit_filter(msg_type, AUDIT_FILTER_USER);
if (err == 1) { /* match or error */
err = 0;
if (msg_type == AUDIT_USER_TTY) {
@@ -1379,7 +1379,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
if (audit_initialized != AUDIT_INITIALIZED)
return NULL;
- if (unlikely(audit_filter_type(type)))
+ if (unlikely(!audit_filter(type, AUDIT_FILTER_TYPE)))
return NULL;
if (gfp_mask & __GFP_DIRECT_RECLAIM) {
diff --git a/kernel/audit.h b/kernel/audit.h
index a492f4c4e..431444c37 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -331,6 +331,8 @@ extern pid_t audit_sig_pid;
extern kuid_t audit_sig_uid;
extern u32 audit_sig_sid;
+extern int audit_filter(int msgtype, unsigned int listtype);
+
#ifdef CONFIG_AUDITSYSCALL
extern int __audit_signal_info(int sig, struct task_struct *t);
static inline int audit_signal_info(int sig, struct task_struct *t)
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 94ca7b1e5..85d9cac49 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1290,113 +1290,72 @@ int audit_compare_dname_path(const char *dname, const char *path, int parentlen)
return strncmp(p, dname, dlen);
}
-static int audit_filter_user_rules(struct audit_krule *rule, int type,
- enum audit_state *state)
+int audit_filter(int msgtype, unsigned int listtype)
{
- int i;
-
- for (i = 0; i < rule->field_count; i++) {
- struct audit_field *f = &rule->fields[i];
- pid_t pid;
- int result = 0;
- u32 sid;
-
- switch (f->type) {
- case AUDIT_PID:
- pid = task_pid_nr(current);
- result = audit_comparator(pid, f->op, f->val);
- break;
- case AUDIT_UID:
- result = audit_uid_comparator(current_uid(), f->op, f->uid);
- break;
- case AUDIT_GID:
- result = audit_gid_comparator(current_gid(), f->op, f->gid);
- break;
- case AUDIT_LOGINUID:
- result = audit_uid_comparator(audit_get_loginuid(current),
- f->op, f->uid);
- break;
- case AUDIT_LOGINUID_SET:
- result = audit_comparator(audit_loginuid_set(current),
- f->op, f->val);
- break;
- case AUDIT_MSGTYPE:
- result = audit_comparator(type, f->op, f->val);
- break;
- case AUDIT_SUBJ_USER:
- case AUDIT_SUBJ_ROLE:
- case AUDIT_SUBJ_TYPE:
- case AUDIT_SUBJ_SEN:
- case AUDIT_SUBJ_CLR:
- if (f->lsm_rule) {
- security_task_getsecid(current, &sid);
- result = security_audit_rule_match(sid,
- f->type,
- f->op,
- f->lsm_rule,
- NULL);
- }
- break;
- }
-
- if (!result)
- return 0;
- }
- switch (rule->action) {
- case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
- case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
- }
- return 1;
-}
-
-int audit_filter_user(int type)
-{
- enum audit_state state = AUDIT_DISABLED;
struct audit_entry *e;
- int rc, ret;
-
- ret = 1; /* Audit by default */
-
- rcu_read_lock();
- list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) {
- rc = audit_filter_user_rules(&e->rule, type, &state);
- if (rc) {
- if (rc > 0 && state == AUDIT_DISABLED)
- ret = 0;
- break;
- }
- }
- rcu_read_unlock();
-
- return ret;
-}
-
-int audit_filter_type(int type)
-{
- struct audit_entry *e;
- int result = 0;
+ int ret = 1; /* Audit by default */
rcu_read_lock();
- if (list_empty(&audit_filter_list[AUDIT_FILTER_TYPE]))
+ if (list_empty(&audit_filter_list[listtype]))
goto unlock_and_return;
+ list_for_each_entry_rcu(e, &audit_filter_list[listtype], list) {
+ int i, result = 0;
- list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TYPE],
- list) {
- int i;
for (i = 0; i < e->rule.field_count; i++) {
struct audit_field *f = &e->rule.fields[i];
- if (f->type == AUDIT_MSGTYPE) {
- result = audit_comparator(type, f->op, f->val);
- if (!result)
- break;
+ pid_t pid;
+ u32 sid;
+
+ switch (f->type) {
+ case AUDIT_PID:
+ pid = task_pid_nr(current);
+ result = audit_comparator(pid, f->op, f->val);
+ break;
+ case AUDIT_UID:
+ result = audit_uid_comparator(current_uid(), f->op, f->uid);
+ break;
+ case AUDIT_GID:
+ result = audit_gid_comparator(current_gid(), f->op, f->gid);
+ break;
+ case AUDIT_LOGINUID:
+ result = audit_uid_comparator(audit_get_loginuid(current),
+ f->op, f->uid);
+ break;
+ case AUDIT_LOGINUID_SET:
+ result = audit_comparator(audit_loginuid_set(current),
+ f->op, f->val);
+ break;
+ case AUDIT_MSGTYPE:
+ result = audit_comparator(msgtype, f->op, f->val);
+ break;
+ case AUDIT_SUBJ_USER:
+ case AUDIT_SUBJ_ROLE:
+ case AUDIT_SUBJ_TYPE:
+ case AUDIT_SUBJ_SEN:
+ case AUDIT_SUBJ_CLR:
+ if (f->lsm_rule) {
+ security_task_getsecid(current, &sid);
+ result = security_audit_rule_match(sid,
+ f->type, f->op, f->lsm_rule, NULL);
+ }
+ break;
+ default:
+ goto unlock_and_return;
}
+ if (result < 0) /* error */
+ goto unlock_and_return;
+ if (!result)
+ break;
+ }
+ if (result > 0) {
+ if (e->rule.action == AUDIT_NEVER || listtype == AUDIT_FILTER_TYPE)
+ ret = 0;
+ break;
}
- if (result)
- goto unlock_and_return;
}
unlock_and_return:
rcu_read_unlock();
- return result;
+ return ret;
}
static int update_lsm_rule(struct audit_krule *r)
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index b3341284f..5abf1dc1f 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -696,8 +696,12 @@ static int audit_filter_rules(struct task_struct *tsk,
ctx->prio = rule->prio;
}
switch (rule->action) {
- case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
- case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
+ case AUDIT_NEVER:
+ *state = AUDIT_DISABLED;
+ break;
+ case AUDIT_ALWAYS:
+ *state = AUDIT_RECORD_CONTEXT;
+ break;
}
return 1;
}
@@ -1421,7 +1425,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
if (context->pwd.dentry && context->pwd.mnt) {
ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD);
if (ab) {
- audit_log_d_path(ab, " cwd=", &context->pwd);
+ audit_log_d_path(ab, "cwd=", &context->pwd);
audit_log_end(ab);
}
}
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 76d5a794e..633a650d7 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -328,8 +328,8 @@ static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
}
/* only called from syscall */
-static int fd_array_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 map_flags)
+int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
+ void *key, void *value, u64 map_flags)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
void *new_ptr, *old_ptr;
@@ -342,7 +342,7 @@ static int fd_array_map_update_elem(struct bpf_map *map, void *key,
return -E2BIG;
ufd = *(u32 *)value;
- new_ptr = map->ops->map_fd_get_ptr(map, ufd);
+ new_ptr = map->ops->map_fd_get_ptr(map, map_file, ufd);
if (IS_ERR(new_ptr))
return PTR_ERR(new_ptr);
@@ -371,10 +371,12 @@ static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
}
}
-static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd)
+static void *prog_fd_array_get_ptr(struct bpf_map *map,
+ struct file *map_file, int fd)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
struct bpf_prog *prog = bpf_prog_get(fd);
+
if (IS_ERR(prog))
return prog;
@@ -382,14 +384,13 @@ static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd)
bpf_prog_put(prog);
return ERR_PTR(-EINVAL);
}
+
return prog;
}
static void prog_fd_array_put_ptr(void *ptr)
{
- struct bpf_prog *prog = ptr;
-
- bpf_prog_put_rcu(prog);
+ bpf_prog_put(ptr);
}
/* decrement refcnt of all bpf_progs that are stored in this map */
@@ -407,7 +408,6 @@ static const struct bpf_map_ops prog_array_ops = {
.map_free = fd_array_map_free,
.map_get_next_key = array_map_get_next_key,
.map_lookup_elem = fd_array_map_lookup_elem,
- .map_update_elem = fd_array_map_update_elem,
.map_delete_elem = fd_array_map_delete_elem,
.map_fd_get_ptr = prog_fd_array_get_ptr,
.map_fd_put_ptr = prog_fd_array_put_ptr,
@@ -425,59 +425,105 @@ static int __init register_prog_array_map(void)
}
late_initcall(register_prog_array_map);
-static void perf_event_array_map_free(struct bpf_map *map)
+static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
+ struct file *map_file)
{
- bpf_fd_array_map_clear(map);
- fd_array_map_free(map);
+ struct bpf_event_entry *ee;
+
+ ee = kzalloc(sizeof(*ee), GFP_ATOMIC);
+ if (ee) {
+ ee->event = perf_file->private_data;
+ ee->perf_file = perf_file;
+ ee->map_file = map_file;
+ }
+
+ return ee;
}
-static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd)
+static void __bpf_event_entry_free(struct rcu_head *rcu)
{
- struct perf_event *event;
- const struct perf_event_attr *attr;
- struct file *file;
+ struct bpf_event_entry *ee;
- file = perf_event_get(fd);
- if (IS_ERR(file))
- return file;
+ ee = container_of(rcu, struct bpf_event_entry, rcu);
+ fput(ee->perf_file);
+ kfree(ee);
+}
- event = file->private_data;
+static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee)
+{
+ call_rcu(&ee->rcu, __bpf_event_entry_free);
+}
- attr = perf_event_attrs(event);
- if (IS_ERR(attr))
- goto err;
+static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
+ struct file *map_file, int fd)
+{
+ const struct perf_event_attr *attr;
+ struct bpf_event_entry *ee;
+ struct perf_event *event;
+ struct file *perf_file;
- if (attr->inherit)
- goto err;
+ perf_file = perf_event_get(fd);
+ if (IS_ERR(perf_file))
+ return perf_file;
- if (attr->type == PERF_TYPE_RAW)
- return file;
+ event = perf_file->private_data;
+ ee = ERR_PTR(-EINVAL);
- if (attr->type == PERF_TYPE_HARDWARE)
- return file;
+ attr = perf_event_attrs(event);
+ if (IS_ERR(attr) || attr->inherit)
+ goto err_out;
+
+ switch (attr->type) {
+ case PERF_TYPE_SOFTWARE:
+ if (attr->config != PERF_COUNT_SW_BPF_OUTPUT)
+ goto err_out;
+ /* fall-through */
+ case PERF_TYPE_RAW:
+ case PERF_TYPE_HARDWARE:
+ ee = bpf_event_entry_gen(perf_file, map_file);
+ if (ee)
+ return ee;
+ ee = ERR_PTR(-ENOMEM);
+ /* fall-through */
+ default:
+ break;
+ }
- if (attr->type == PERF_TYPE_SOFTWARE &&
- attr->config == PERF_COUNT_SW_BPF_OUTPUT)
- return file;
-err:
- fput(file);
- return ERR_PTR(-EINVAL);
+err_out:
+ fput(perf_file);
+ return ee;
}
static void perf_event_fd_array_put_ptr(void *ptr)
{
- fput((struct file *)ptr);
+ bpf_event_entry_free_rcu(ptr);
+}
+
+static void perf_event_fd_array_release(struct bpf_map *map,
+ struct file *map_file)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ struct bpf_event_entry *ee;
+ int i;
+
+ rcu_read_lock();
+ for (i = 0; i < array->map.max_entries; i++) {
+ ee = READ_ONCE(array->ptrs[i]);
+ if (ee && ee->map_file == map_file)
+ fd_array_map_delete_elem(map, &i);
+ }
+ rcu_read_unlock();
}
static const struct bpf_map_ops perf_event_array_ops = {
.map_alloc = fd_array_map_alloc,
- .map_free = perf_event_array_map_free,
+ .map_free = fd_array_map_free,
.map_get_next_key = array_map_get_next_key,
.map_lookup_elem = fd_array_map_lookup_elem,
- .map_update_elem = fd_array_map_update_elem,
.map_delete_elem = fd_array_map_delete_elem,
.map_fd_get_ptr = perf_event_fd_array_get_ptr,
.map_fd_put_ptr = perf_event_fd_array_put_ptr,
+ .map_release = perf_event_fd_array_release,
};
static struct bpf_map_type_list perf_event_array_type __read_mostly = {
@@ -491,3 +537,46 @@ static int __init register_perf_event_array_map(void)
return 0;
}
late_initcall(register_perf_event_array_map);
+
+#ifdef CONFIG_SOCK_CGROUP_DATA
+static void *cgroup_fd_array_get_ptr(struct bpf_map *map,
+ struct file *map_file /* not used */,
+ int fd)
+{
+ return cgroup_get_from_fd(fd);
+}
+
+static void cgroup_fd_array_put_ptr(void *ptr)
+{
+ /* cgroup_put free cgrp after a rcu grace period */
+ cgroup_put(ptr);
+}
+
+static void cgroup_fd_array_free(struct bpf_map *map)
+{
+ bpf_fd_array_map_clear(map);
+ fd_array_map_free(map);
+}
+
+static const struct bpf_map_ops cgroup_array_ops = {
+ .map_alloc = fd_array_map_alloc,
+ .map_free = cgroup_fd_array_free,
+ .map_get_next_key = array_map_get_next_key,
+ .map_lookup_elem = fd_array_map_lookup_elem,
+ .map_delete_elem = fd_array_map_delete_elem,
+ .map_fd_get_ptr = cgroup_fd_array_get_ptr,
+ .map_fd_put_ptr = cgroup_fd_array_put_ptr,
+};
+
+static struct bpf_map_type_list cgroup_array_type __read_mostly = {
+ .ops = &cgroup_array_ops,
+ .type = BPF_MAP_TYPE_CGROUP_ARRAY,
+};
+
+static int __init register_cgroup_array_map(void)
+{
+ bpf_register_map_type(&cgroup_array_type);
+ return 0;
+}
+late_initcall(register_cgroup_array_map);
+#endif
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index b94a36550..03fd23d4d 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -719,14 +719,13 @@ select_insn:
if (unlikely(index >= array->map.max_entries))
goto out;
-
if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT))
goto out;
tail_call_cnt++;
prog = READ_ONCE(array->ptrs[index]);
- if (unlikely(!prog))
+ if (!prog)
goto out;
/* ARG1 at this point is guaranteed to point to CTX from
@@ -1055,9 +1054,11 @@ const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
return NULL;
}
-const struct bpf_func_proto * __weak bpf_get_event_output_proto(void)
+u64 __weak
+bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
+ void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
{
- return NULL;
+ return -ENOTSUPP;
}
/* Always built-in helper functions. */
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index fff3650d5..570eeca7b 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -26,11 +26,18 @@ struct bpf_htab {
struct bucket *buckets;
void *elems;
struct pcpu_freelist freelist;
+ void __percpu *extra_elems;
atomic_t count; /* number of elements in this hashtable */
u32 n_buckets; /* number of hash buckets */
u32 elem_size; /* size of each element in bytes */
};
+enum extra_elem_state {
+ HTAB_NOT_AN_EXTRA_ELEM = 0,
+ HTAB_EXTRA_ELEM_FREE,
+ HTAB_EXTRA_ELEM_USED
+};
+
/* each htab element is struct htab_elem + key + value */
struct htab_elem {
union {
@@ -38,7 +45,10 @@ struct htab_elem {
struct bpf_htab *htab;
struct pcpu_freelist_node fnode;
};
- struct rcu_head rcu;
+ union {
+ struct rcu_head rcu;
+ enum extra_elem_state state;
+ };
u32 hash;
char key[0] __aligned(8);
};
@@ -113,6 +123,23 @@ free_elems:
return err;
}
+static int alloc_extra_elems(struct bpf_htab *htab)
+{
+ void __percpu *pptr;
+ int cpu;
+
+ pptr = __alloc_percpu_gfp(htab->elem_size, 8, GFP_USER | __GFP_NOWARN);
+ if (!pptr)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu) {
+ ((struct htab_elem *)per_cpu_ptr(pptr, cpu))->state =
+ HTAB_EXTRA_ELEM_FREE;
+ }
+ htab->extra_elems = pptr;
+ return 0;
+}
+
/* Called from syscall */
static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
{
@@ -185,6 +212,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
if (percpu)
cost += (u64) round_up(htab->map.value_size, 8) *
num_possible_cpus() * htab->map.max_entries;
+ else
+ cost += (u64) htab->elem_size * num_possible_cpus();
if (cost >= U32_MAX - PAGE_SIZE)
/* make sure page count doesn't overflow */
@@ -212,14 +241,22 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
raw_spin_lock_init(&htab->buckets[i].lock);
}
+ if (!percpu) {
+ err = alloc_extra_elems(htab);
+ if (err)
+ goto free_buckets;
+ }
+
if (!(attr->map_flags & BPF_F_NO_PREALLOC)) {
err = prealloc_elems_and_freelist(htab);
if (err)
- goto free_buckets;
+ goto free_extra_elems;
}
return &htab->map;
+free_extra_elems:
+ free_percpu(htab->extra_elems);
free_buckets:
kvfree(htab->buckets);
free_htab:
@@ -349,7 +386,6 @@ static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
free_percpu(htab_elem_get_ptr(l, htab->map.key_size));
kfree(l);
-
}
static void htab_elem_free_rcu(struct rcu_head *head)
@@ -370,6 +406,11 @@ static void htab_elem_free_rcu(struct rcu_head *head)
static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
{
+ if (l->state == HTAB_EXTRA_ELEM_USED) {
+ l->state = HTAB_EXTRA_ELEM_FREE;
+ return;
+ }
+
if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) {
pcpu_freelist_push(&htab->freelist, &l->fnode);
} else {
@@ -381,25 +422,44 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
void *value, u32 key_size, u32 hash,
- bool percpu, bool onallcpus)
+ bool percpu, bool onallcpus,
+ bool old_elem_exists)
{
u32 size = htab->map.value_size;
bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC);
struct htab_elem *l_new;
void __percpu *pptr;
+ int err = 0;
if (prealloc) {
l_new = (struct htab_elem *)pcpu_freelist_pop(&htab->freelist);
if (!l_new)
- return ERR_PTR(-E2BIG);
+ err = -E2BIG;
} else {
if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
atomic_dec(&htab->count);
- return ERR_PTR(-E2BIG);
+ err = -E2BIG;
+ } else {
+ l_new = kmalloc(htab->elem_size,
+ GFP_ATOMIC | __GFP_NOWARN);
+ if (!l_new)
+ return ERR_PTR(-ENOMEM);
}
- l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
- if (!l_new)
- return ERR_PTR(-ENOMEM);
+ }
+
+ if (err) {
+ if (!old_elem_exists)
+ return ERR_PTR(err);
+
+ /* if we're updating the existing element and the hash table
+ * is full, use per-cpu extra elems
+ */
+ l_new = this_cpu_ptr(htab->extra_elems);
+ if (l_new->state != HTAB_EXTRA_ELEM_FREE)
+ return ERR_PTR(-E2BIG);
+ l_new->state = HTAB_EXTRA_ELEM_USED;
+ } else {
+ l_new->state = HTAB_NOT_AN_EXTRA_ELEM;
}
memcpy(l_new->key, key, key_size);
@@ -489,7 +549,8 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
if (ret)
goto err;
- l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false);
+ l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
+ !!l_old);
if (IS_ERR(l_new)) {
/* all pre-allocated elements are in use or memory exhausted */
ret = PTR_ERR(l_new);
@@ -563,7 +624,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
}
} else {
l_new = alloc_htab_elem(htab, key, value, key_size,
- hash, true, onallcpus);
+ hash, true, onallcpus, false);
if (IS_ERR(l_new)) {
ret = PTR_ERR(l_new);
goto err;
@@ -652,6 +713,7 @@ static void htab_map_free(struct bpf_map *map)
htab_free_elems(htab);
pcpu_freelist_destroy(&htab->freelist);
}
+ free_percpu(htab->extra_elems);
kvfree(htab->buckets);
kfree(htab);
}
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index ad7a0573f..1ea3afba1 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -101,7 +101,7 @@ const struct bpf_func_proto bpf_get_prandom_u32_proto = {
static u64 bpf_get_smp_processor_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
{
- return raw_smp_processor_id();
+ return smp_processor_id();
}
const struct bpf_func_proto bpf_get_smp_processor_id_proto = {
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 318858edb..5967b870a 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -11,7 +11,7 @@
* version 2 as published by the Free Software Foundation.
*/
-#include <linux/module.h>
+#include <linux/init.h>
#include <linux/magic.h>
#include <linux/major.h>
#include <linux/mount.h>
@@ -367,8 +367,6 @@ static struct file_system_type bpf_fs_type = {
.kill_sb = kill_litter_super,
};
-MODULE_ALIAS_FS("bpf");
-
static int __init bpf_init(void)
{
int ret;
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 080a2dfb5..bf4495fcd 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -99,7 +99,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
if (err)
goto free_smap;
- err = get_callchain_buffers();
+ err = get_callchain_buffers(sysctl_perf_event_max_stack);
if (err)
goto free_smap;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 46ecce4b7..228f96244 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -124,7 +124,12 @@ void bpf_map_put_with_uref(struct bpf_map *map)
static int bpf_map_release(struct inode *inode, struct file *filp)
{
- bpf_map_put_with_uref(filp->private_data);
+ struct bpf_map *map = filp->private_data;
+
+ if (map->ops->map_release)
+ map->ops->map_release(map, filp);
+
+ bpf_map_put_with_uref(map);
return 0;
}
@@ -387,6 +392,13 @@ static int map_update_elem(union bpf_attr *attr)
err = bpf_percpu_hash_update(map, key, value, attr->flags);
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
err = bpf_percpu_array_update(map, key, value, attr->flags);
+ } else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY ||
+ map->map_type == BPF_MAP_TYPE_PROG_ARRAY ||
+ map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY) {
+ rcu_read_lock();
+ err = bpf_fd_array_map_update_elem(map, f.file, key, value,
+ attr->flags);
+ rcu_read_unlock();
} else {
rcu_read_lock();
err = map->ops->map_update_elem(map, key, value, attr->flags);
@@ -612,7 +624,7 @@ static void bpf_prog_uncharge_memlock(struct bpf_prog *prog)
free_uid(user);
}
-static void __prog_put_common(struct rcu_head *rcu)
+static void __bpf_prog_put_rcu(struct rcu_head *rcu)
{
struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
@@ -621,17 +633,10 @@ static void __prog_put_common(struct rcu_head *rcu)
bpf_prog_free(aux->prog);
}
-/* version of bpf_prog_put() that is called after a grace period */
-void bpf_prog_put_rcu(struct bpf_prog *prog)
-{
- if (atomic_dec_and_test(&prog->aux->refcnt))
- call_rcu(&prog->aux->rcu, __prog_put_common);
-}
-
void bpf_prog_put(struct bpf_prog *prog)
{
if (atomic_dec_and_test(&prog->aux->refcnt))
- __prog_put_common(&prog->aux->rcu);
+ call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
}
EXPORT_SYMBOL_GPL(bpf_prog_put);
@@ -639,7 +644,7 @@ static int bpf_prog_release(struct inode *inode, struct file *filp)
{
struct bpf_prog *prog = filp->private_data;
- bpf_prog_put_rcu(prog);
+ bpf_prog_put(prog);
return 0;
}
@@ -653,7 +658,7 @@ int bpf_prog_new_fd(struct bpf_prog *prog)
O_RDWR | O_CLOEXEC);
}
-static struct bpf_prog *__bpf_prog_get(struct fd f)
+static struct bpf_prog *____bpf_prog_get(struct fd f)
{
if (!f.file)
return ERR_PTR(-EBADF);
@@ -665,33 +670,50 @@ static struct bpf_prog *__bpf_prog_get(struct fd f)
return f.file->private_data;
}
-struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
+struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i)
{
- if (atomic_inc_return(&prog->aux->refcnt) > BPF_MAX_REFCNT) {
- atomic_dec(&prog->aux->refcnt);
+ if (atomic_add_return(i, &prog->aux->refcnt) > BPF_MAX_REFCNT) {
+ atomic_sub(i, &prog->aux->refcnt);
return ERR_PTR(-EBUSY);
}
return prog;
}
+EXPORT_SYMBOL_GPL(bpf_prog_add);
-/* called by sockets/tracing/seccomp before attaching program to an event
- * pairs with bpf_prog_put()
- */
-struct bpf_prog *bpf_prog_get(u32 ufd)
+struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
+{
+ return bpf_prog_add(prog, 1);
+}
+
+static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type)
{
struct fd f = fdget(ufd);
struct bpf_prog *prog;
- prog = __bpf_prog_get(f);
+ prog = ____bpf_prog_get(f);
if (IS_ERR(prog))
return prog;
+ if (type && prog->type != *type) {
+ prog = ERR_PTR(-EINVAL);
+ goto out;
+ }
prog = bpf_prog_inc(prog);
+out:
fdput(f);
-
return prog;
}
-EXPORT_SYMBOL_GPL(bpf_prog_get);
+
+struct bpf_prog *bpf_prog_get(u32 ufd)
+{
+ return __bpf_prog_get(ufd, NULL);
+}
+
+struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type)
+{
+ return __bpf_prog_get(ufd, &type);
+}
+EXPORT_SYMBOL_GPL(bpf_prog_get_type);
/* last field in 'union bpf_attr' used by this command */
#define BPF_PROG_LOAD_LAST_FIELD kern_version
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6d011c693..daea765d7 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -654,6 +654,16 @@ static int check_map_access(struct verifier_env *env, u32 regno, int off,
#define MAX_PACKET_OFF 0xffff
+static bool may_write_pkt_data(enum bpf_prog_type type)
+{
+ switch (type) {
+ case BPF_PROG_TYPE_XDP:
+ return true;
+ default:
+ return false;
+ }
+}
+
static int check_packet_access(struct verifier_env *env, u32 regno, int off,
int size)
{
@@ -714,6 +724,7 @@ static int check_ptr_alignment(struct verifier_env *env, struct reg_state *reg,
switch (env->prog->type) {
case BPF_PROG_TYPE_SCHED_CLS:
case BPF_PROG_TYPE_SCHED_ACT:
+ case BPF_PROG_TYPE_XDP:
break;
default:
verbose("verifier is misconfigured\n");
@@ -806,10 +817,15 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
err = check_stack_read(state, off, size, value_regno);
}
} else if (state->regs[regno].type == PTR_TO_PACKET) {
- if (t == BPF_WRITE) {
+ if (t == BPF_WRITE && !may_write_pkt_data(env->prog->type)) {
verbose("cannot write into packet\n");
return -EACCES;
}
+ if (t == BPF_WRITE && value_regno >= 0 &&
+ is_pointer_value(env, value_regno)) {
+ verbose("R%d leaks addr into packet\n", value_regno);
+ return -EACCES;
+ }
err = check_packet_access(env, regno, off, size);
if (!err && t == BPF_READ && value_regno >= 0)
mark_reg_unknown_value(state->regs, value_regno);
@@ -1036,6 +1052,10 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
if (func_id != BPF_FUNC_get_stackid)
goto error;
break;
+ case BPF_MAP_TYPE_CGROUP_ARRAY:
+ if (func_id != BPF_FUNC_skb_under_cgroup)
+ goto error;
+ break;
default:
break;
}
@@ -1055,6 +1075,10 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
if (map->map_type != BPF_MAP_TYPE_STACK_TRACE)
goto error;
break;
+ case BPF_FUNC_skb_under_cgroup:
+ if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY)
+ goto error;
+ break;
default:
break;
}
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 129a7ca5f..d6b729beb 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -61,7 +61,7 @@
#include <linux/cpuset.h>
#include <linux/proc_ns.h>
#include <linux/nsproxy.h>
-#include <linux/proc_ns.h>
+#include <linux/file.h>
#include <net/sock.h>
/*
@@ -1160,18 +1160,12 @@ static void cgroup_exit_root_id(struct cgroup_root *root)
{
lockdep_assert_held(&cgroup_mutex);
- if (root->hierarchy_id) {
- idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
- root->hierarchy_id = 0;
- }
+ idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
}
static void cgroup_free_root(struct cgroup_root *root)
{
if (root) {
- /* hierarchy ID should already have been released */
- WARN_ON_ONCE(root->hierarchy_id);
-
idr_destroy(&root->cgroup_idr);
kfree(root);
}
@@ -3452,9 +3446,28 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
* Except for the root, subtree_control must be zero for a cgroup
* with tasks so that child cgroups don't compete against tasks.
*/
- if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) {
- ret = -EBUSY;
- goto out_unlock;
+ if (enable && cgroup_parent(cgrp)) {
+ struct cgrp_cset_link *link;
+
+ /*
+ * Because namespaces pin csets too, @cgrp->cset_links
+ * might not be empty even when @cgrp is empty. Walk and
+ * verify each cset.
+ */
+ spin_lock_irq(&css_set_lock);
+
+ ret = 0;
+ list_for_each_entry(link, &cgrp->cset_links, cset_link) {
+ if (css_set_populated(link->cset)) {
+ ret = -EBUSY;
+ break;
+ }
+ }
+
+ spin_unlock_irq(&css_set_lock);
+
+ if (ret)
+ goto out_unlock;
}
/* save and update control masks and prepare csses */
@@ -3905,7 +3918,9 @@ void cgroup_file_notify(struct cgroup_file *cfile)
* cgroup_task_count - count the number of tasks in a cgroup.
* @cgrp: the cgroup in question
*
- * Return the number of tasks in the cgroup.
+ * Return the number of tasks in the cgroup. The returned number can be
+ * higher than the actual number of tasks due to css_set references from
+ * namespace roots and temporary usages.
*/
static int cgroup_task_count(const struct cgroup *cgrp)
{
@@ -5147,6 +5162,8 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
lockdep_assert_held(&cgroup_mutex);
css = ss->css_alloc(parent_css);
+ if (!css)
+ css = ERR_PTR(-ENOMEM);
if (IS_ERR(css))
return css;
@@ -6173,7 +6190,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
{
WARN_ON_ONCE(!rcu_read_lock_held());
- return id > 0 ? idr_find(&ss->css_idr, id) : NULL;
+ return idr_find(&ss->css_idr, id);
}
/**
@@ -6210,6 +6227,40 @@ struct cgroup *cgroup_get_from_path(const char *path)
}
EXPORT_SYMBOL_GPL(cgroup_get_from_path);
+/**
+ * cgroup_get_from_fd - get a cgroup pointer from a fd
+ * @fd: fd obtained by open(cgroup2_dir)
+ *
+ * Find the cgroup from a fd which should be obtained
+ * by opening a cgroup directory. Returns a pointer to the
+ * cgroup on success. ERR_PTR is returned if the cgroup
+ * cannot be found: -EBADF when fget_raw() yields no file for @fd or
+ * when the cgroup fails the cgroup_on_dfl() check, otherwise the
+ * error pointer produced by css_tryget_online_from_dir().
+ *
+ * NOTE(review): the cgroup_on_dfl() failure path drops a reference with
+ * cgroup_put(), so the successful return presumably hands a held
+ * reference to the caller, who would have to cgroup_put() it - confirm
+ * against css_tryget_online_from_dir().
+ */
+struct cgroup *cgroup_get_from_fd(int fd)
+{
+ struct cgroup_subsys_state *css;
+ struct cgroup *cgrp;
+ struct file *f;
+
+ f = fget_raw(fd);
+ if (!f)
+ return ERR_PTR(-EBADF);
+
+ css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
+ /* the file was only needed for the dentry lookup above */
+ fput(f);
+ if (IS_ERR(css))
+ return ERR_CAST(css);
+
+ cgrp = css->cgroup;
+ if (!cgroup_on_dfl(cgrp)) {
+ cgroup_put(cgrp);
+ return ERR_PTR(-EBADF);
+ }
+
+ return cgrp;
+}
+EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
+
/*
* sock->sk_cgrp_data handling. For more info, see sock_cgroup_data
* definition in cgroup-defs.h.
diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c
index 303097b37..2bd673783 100644
--- a/kernel/cgroup_pids.c
+++ b/kernel/cgroup_pids.c
@@ -49,6 +49,12 @@ struct pids_cgroup {
*/
atomic64_t counter;
int64_t limit;
+
+ /* Handle for "pids.events" */
+ struct cgroup_file events_file;
+
+ /* Number of times fork failed because limit was hit. */
+ atomic64_t events_limit;
};
static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css)
@@ -72,6 +78,7 @@ pids_css_alloc(struct cgroup_subsys_state *parent)
pids->limit = PIDS_MAX;
atomic64_set(&pids->counter, 0);
+ atomic64_set(&pids->events_limit, 0);
return &pids->css;
}
@@ -213,10 +220,21 @@ static int pids_can_fork(struct task_struct *task)
{
struct cgroup_subsys_state *css;
struct pids_cgroup *pids;
+ int err;
css = task_css_check(current, pids_cgrp_id, true);
pids = css_pids(css);
- return pids_try_charge(pids, 1);
+ err = pids_try_charge(pids, 1);
+ if (err) {
+ /* Only log the first time events_limit is incremented. */
+ if (atomic64_inc_return(&pids->events_limit) == 1) {
+ pr_info("cgroup: fork rejected by pids controller in ");
+ pr_cont_cgroup_path(task_cgroup(current, pids_cgrp_id));
+ pr_cont("\n");
+ }
+ cgroup_file_notify(&pids->events_file);
+ }
+ return err;
}
static void pids_cancel_fork(struct task_struct *task)
@@ -288,6 +306,14 @@ static s64 pids_current_read(struct cgroup_subsys_state *css,
return atomic64_read(&pids->counter);
}
+/*
+ * seq_show handler for the "events" cftype. Prints the events_limit
+ * counter (bumped by pids_can_fork() each time the charge fails) as a
+ * single "max <count>" line. Always returns 0.
+ */
+static int pids_events_show(struct seq_file *sf, void *v)
+{
+ struct pids_cgroup *pids = css_pids(seq_css(sf));
+
+ seq_printf(sf, "max %lld\n", (s64)atomic64_read(&pids->events_limit));
+ return 0;
+}
+
static struct cftype pids_files[] = {
{
.name = "max",
@@ -300,6 +326,12 @@ static struct cftype pids_files[] = {
.read_s64 = pids_current_read,
.flags = CFTYPE_NOT_ON_ROOT,
},
+ {
+ .name = "events",
+ .seq_show = pids_events_show,
+ .file_offset = offsetof(struct pids_cgroup, events_file),
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
{ } /* terminate */
};
diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config
new file mode 100644
index 000000000..9f748ed7b
--- /dev/null
+++ b/kernel/configs/android-base.config
@@ -0,0 +1,152 @@
+# KEEP ALPHABETICALLY SORTED
+# CONFIG_DEVKMEM is not set
+# CONFIG_DEVMEM is not set
+# CONFIG_INET_LRO is not set
+# CONFIG_MODULES is not set
+# CONFIG_OABI_COMPAT is not set
+# CONFIG_SYSVIPC is not set
+CONFIG_ANDROID=y
+CONFIG_ANDROID_BINDER_IPC=y
+CONFIG_ANDROID_LOW_MEMORY_KILLER=y
+CONFIG_ARMV8_DEPRECATED=y
+CONFIG_ASHMEM=y
+CONFIG_AUDIT=y
+CONFIG_BLK_DEV_DM=y
+CONFIG_BLK_DEV_INITRD=y
+CONFIG_CGROUPS=y
+CONFIG_CGROUP_CPUACCT=y
+CONFIG_CGROUP_DEBUG=y
+CONFIG_CGROUP_FREEZER=y
+CONFIG_CGROUP_SCHED=y
+CONFIG_CP15_BARRIER_EMULATION=y
+CONFIG_DM_CRYPT=y
+CONFIG_DM_VERITY=y
+CONFIG_DM_VERITY_FEC=y
+CONFIG_EMBEDDED=y
+CONFIG_FB=y
+CONFIG_HIGH_RES_TIMERS=y
+CONFIG_INET6_AH=y
+CONFIG_INET6_ESP=y
+CONFIG_INET6_IPCOMP=y
+CONFIG_INET=y
+CONFIG_INET_DIAG_DESTROY=y
+CONFIG_INET_ESP=y
+CONFIG_INET_XFRM_MODE_TUNNEL=y
+CONFIG_IP6_NF_FILTER=y
+CONFIG_IP6_NF_IPTABLES=y
+CONFIG_IP6_NF_MANGLE=y
+CONFIG_IP6_NF_RAW=y
+CONFIG_IP6_NF_TARGET_REJECT=y
+CONFIG_IPV6=y
+CONFIG_IPV6_MIP6=y
+CONFIG_IPV6_MULTIPLE_TABLES=y
+CONFIG_IPV6_OPTIMISTIC_DAD=y
+CONFIG_IPV6_PRIVACY=y
+CONFIG_IPV6_ROUTER_PREF=y
+CONFIG_IPV6_ROUTE_INFO=y
+CONFIG_IP_ADVANCED_ROUTER=y
+CONFIG_IP_MULTICAST=y
+CONFIG_IP_MULTIPLE_TABLES=y
+CONFIG_IP_NF_ARPFILTER=y
+CONFIG_IP_NF_ARPTABLES=y
+CONFIG_IP_NF_ARP_MANGLE=y
+CONFIG_IP_NF_FILTER=y
+CONFIG_IP_NF_IPTABLES=y
+CONFIG_IP_NF_MANGLE=y
+CONFIG_IP_NF_MATCH_AH=y
+CONFIG_IP_NF_MATCH_ECN=y
+CONFIG_IP_NF_MATCH_TTL=y
+CONFIG_IP_NF_NAT=y
+CONFIG_IP_NF_RAW=y
+CONFIG_IP_NF_SECURITY=y
+CONFIG_IP_NF_TARGET_MASQUERADE=y
+CONFIG_IP_NF_TARGET_NETMAP=y
+CONFIG_IP_NF_TARGET_REDIRECT=y
+CONFIG_IP_NF_TARGET_REJECT=y
+CONFIG_NET=y
+CONFIG_NETDEVICES=y
+CONFIG_NETFILTER=y
+CONFIG_NETFILTER_TPROXY=y
+CONFIG_NETFILTER_XT_MATCH_COMMENT=y
+CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y
+CONFIG_NETFILTER_XT_MATCH_CONNMARK=y
+CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y
+CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y
+CONFIG_NETFILTER_XT_MATCH_HELPER=y
+CONFIG_NETFILTER_XT_MATCH_IPRANGE=y
+CONFIG_NETFILTER_XT_MATCH_LENGTH=y
+CONFIG_NETFILTER_XT_MATCH_LIMIT=y
+CONFIG_NETFILTER_XT_MATCH_MAC=y
+CONFIG_NETFILTER_XT_MATCH_MARK=y
+CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y
+CONFIG_NETFILTER_XT_MATCH_POLICY=y
+CONFIG_NETFILTER_XT_MATCH_QUOTA=y
+CONFIG_NETFILTER_XT_MATCH_SOCKET=y
+CONFIG_NETFILTER_XT_MATCH_STATE=y
+CONFIG_NETFILTER_XT_MATCH_STATISTIC=y
+CONFIG_NETFILTER_XT_MATCH_STRING=y
+CONFIG_NETFILTER_XT_MATCH_TIME=y
+CONFIG_NETFILTER_XT_MATCH_U32=y
+CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y
+CONFIG_NETFILTER_XT_TARGET_CONNMARK=y
+CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y
+CONFIG_NETFILTER_XT_TARGET_IDLETIMER=y
+CONFIG_NETFILTER_XT_TARGET_MARK=y
+CONFIG_NETFILTER_XT_TARGET_NFLOG=y
+CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y
+CONFIG_NETFILTER_XT_TARGET_SECMARK=y
+CONFIG_NETFILTER_XT_TARGET_TCPMSS=y
+CONFIG_NETFILTER_XT_TARGET_TPROXY=y
+CONFIG_NETFILTER_XT_TARGET_TRACE=y
+CONFIG_NET_CLS_ACT=y
+CONFIG_NET_CLS_U32=y
+CONFIG_NET_EMATCH=y
+CONFIG_NET_EMATCH_U32=y
+CONFIG_NET_KEY=y
+CONFIG_NET_SCHED=y
+CONFIG_NET_SCH_HTB=y
+CONFIG_NF_CONNTRACK=y
+CONFIG_NF_CONNTRACK_AMANDA=y
+CONFIG_NF_CONNTRACK_EVENTS=y
+CONFIG_NF_CONNTRACK_FTP=y
+CONFIG_NF_CONNTRACK_H323=y
+CONFIG_NF_CONNTRACK_IPV4=y
+CONFIG_NF_CONNTRACK_IPV6=y
+CONFIG_NF_CONNTRACK_IRC=y
+CONFIG_NF_CONNTRACK_NETBIOS_NS=y
+CONFIG_NF_CONNTRACK_PPTP=y
+CONFIG_NF_CONNTRACK_SANE=y
+CONFIG_NF_CONNTRACK_SECMARK=y
+CONFIG_NF_CONNTRACK_TFTP=y
+CONFIG_NF_CT_NETLINK=y
+CONFIG_NF_CT_PROTO_DCCP=y
+CONFIG_NF_CT_PROTO_SCTP=y
+CONFIG_NF_CT_PROTO_UDPLITE=y
+CONFIG_NF_NAT=y
+CONFIG_NO_HZ=y
+CONFIG_PACKET=y
+CONFIG_PM_AUTOSLEEP=y
+CONFIG_PM_WAKELOCKS=y
+CONFIG_PPP=y
+CONFIG_PPP_BSDCOMP=y
+CONFIG_PPP_DEFLATE=y
+CONFIG_PPP_MPPE=y
+CONFIG_PREEMPT=y
+CONFIG_QUOTA=y
+CONFIG_RTC_CLASS=y
+CONFIG_RT_GROUP_SCHED=y
+CONFIG_SECURITY=y
+CONFIG_SECURITY_NETWORK=y
+CONFIG_SECURITY_SELINUX=y
+CONFIG_SETEND_EMULATION=y
+CONFIG_STAGING=y
+CONFIG_SWP_EMULATION=y
+CONFIG_SYNC=y
+CONFIG_TUN=y
+CONFIG_UNIX=y
+CONFIG_USB_CONFIGFS=y
+CONFIG_USB_CONFIGFS_F_FS=y
+CONFIG_USB_CONFIGFS_F_MIDI=y
+CONFIG_USB_GADGET=y
+CONFIG_USB_OTG_WAKELOCK=y
+CONFIG_XFRM_USER=y
diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config
new file mode 100644
index 000000000..e3b953e96
--- /dev/null
+++ b/kernel/configs/android-recommended.config
@@ -0,0 +1,121 @@
+# KEEP ALPHABETICALLY SORTED
+# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
+# CONFIG_INPUT_MOUSE is not set
+# CONFIG_LEGACY_PTYS is not set
+# CONFIG_NF_CONNTRACK_SIP is not set
+# CONFIG_PM_WAKELOCKS_GC is not set
+# CONFIG_VT is not set
+CONFIG_BACKLIGHT_LCD_SUPPORT=y
+CONFIG_BLK_DEV_LOOP=y
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_SIZE=8192
+CONFIG_COMPACTION=y
+CONFIG_DEBUG_RODATA=y
+CONFIG_DM_UEVENT=y
+CONFIG_DRAGONRISE_FF=y
+CONFIG_ENABLE_DEFAULT_TRACERS=y
+CONFIG_EXT4_FS=y
+CONFIG_EXT4_FS_SECURITY=y
+CONFIG_FUSE_FS=y
+CONFIG_GREENASIA_FF=y
+CONFIG_HIDRAW=y
+CONFIG_HID_A4TECH=y
+CONFIG_HID_ACRUX=y
+CONFIG_HID_ACRUX_FF=y
+CONFIG_HID_APPLE=y
+CONFIG_HID_BELKIN=y
+CONFIG_HID_CHERRY=y
+CONFIG_HID_CHICONY=y
+CONFIG_HID_CYPRESS=y
+CONFIG_HID_DRAGONRISE=y
+CONFIG_HID_ELECOM=y
+CONFIG_HID_EMS_FF=y
+CONFIG_HID_EZKEY=y
+CONFIG_HID_GREENASIA=y
+CONFIG_HID_GYRATION=y
+CONFIG_HID_HOLTEK=y
+CONFIG_HID_KENSINGTON=y
+CONFIG_HID_KEYTOUCH=y
+CONFIG_HID_KYE=y
+CONFIG_HID_LCPOWER=y
+CONFIG_HID_LOGITECH=y
+CONFIG_HID_LOGITECH_DJ=y
+CONFIG_HID_MAGICMOUSE=y
+CONFIG_HID_MICROSOFT=y
+CONFIG_HID_MONTEREY=y
+CONFIG_HID_MULTITOUCH=y
+CONFIG_HID_NTRIG=y
+CONFIG_HID_ORTEK=y
+CONFIG_HID_PANTHERLORD=y
+CONFIG_HID_PETALYNX=y
+CONFIG_HID_PICOLCD=y
+CONFIG_HID_PRIMAX=y
+CONFIG_HID_PRODIKEYS=y
+CONFIG_HID_ROCCAT=y
+CONFIG_HID_SAITEK=y
+CONFIG_HID_SAMSUNG=y
+CONFIG_HID_SMARTJOYPLUS=y
+CONFIG_HID_SONY=y
+CONFIG_HID_SPEEDLINK=y
+CONFIG_HID_SUNPLUS=y
+CONFIG_HID_THRUSTMASTER=y
+CONFIG_HID_TIVO=y
+CONFIG_HID_TOPSEED=y
+CONFIG_HID_TWINHAN=y
+CONFIG_HID_UCLOGIC=y
+CONFIG_HID_WACOM=y
+CONFIG_HID_WALTOP=y
+CONFIG_HID_WIIMOTE=y
+CONFIG_HID_ZEROPLUS=y
+CONFIG_HID_ZYDACRON=y
+CONFIG_INPUT_EVDEV=y
+CONFIG_INPUT_GPIO=y
+CONFIG_INPUT_JOYSTICK=y
+CONFIG_INPUT_MISC=y
+CONFIG_INPUT_TABLET=y
+CONFIG_INPUT_UINPUT=y
+CONFIG_ION=y
+CONFIG_JOYSTICK_XPAD=y
+CONFIG_JOYSTICK_XPAD_FF=y
+CONFIG_JOYSTICK_XPAD_LEDS=y
+CONFIG_KALLSYMS_ALL=y
+CONFIG_KSM=y
+CONFIG_LOGIG940_FF=y
+CONFIG_LOGIRUMBLEPAD2_FF=y
+CONFIG_LOGITECH_FF=y
+CONFIG_MD=y
+CONFIG_MEDIA_SUPPORT=y
+CONFIG_MSDOS_FS=y
+CONFIG_PANIC_TIMEOUT=5
+CONFIG_PANTHERLORD_FF=y
+CONFIG_PERF_EVENTS=y
+CONFIG_PM_DEBUG=y
+CONFIG_PM_RUNTIME=y
+CONFIG_PM_WAKELOCKS_LIMIT=0
+CONFIG_POWER_SUPPLY=y
+CONFIG_PSTORE=y
+CONFIG_PSTORE_CONSOLE=y
+CONFIG_PSTORE_RAM=y
+CONFIG_SCHEDSTATS=y
+CONFIG_SMARTJOYPLUS_FF=y
+CONFIG_SND=y
+CONFIG_SOUND=y
+CONFIG_SUSPEND_TIME=y
+CONFIG_TABLET_USB_ACECAD=y
+CONFIG_TABLET_USB_AIPTEK=y
+CONFIG_TABLET_USB_GTCO=y
+CONFIG_TABLET_USB_HANWANG=y
+CONFIG_TABLET_USB_KBTAB=y
+CONFIG_TASKSTATS=y
+CONFIG_TASK_DELAY_ACCT=y
+CONFIG_TASK_IO_ACCOUNTING=y
+CONFIG_TASK_XACCT=y
+CONFIG_TIMER_STATS=y
+CONFIG_TMPFS=y
+CONFIG_TMPFS_POSIX_ACL=y
+CONFIG_UHID=y
+CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
+CONFIG_USB_EHCI_HCD=y
+CONFIG_USB_HIDDEV=y
+CONFIG_USB_USBNET=y
+CONFIG_VFAT_FS=y
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 7b61887f7..341bf80f8 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -517,6 +517,13 @@ static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state,
if (!cpu_online(cpu))
return 0;
+ /*
+ * If we are up and running, use the hotplug thread. For early calls
+ * we invoke the thread function directly.
+ */
+ if (!st->thread)
+ return cpuhp_invoke_callback(cpu, state, cb);
+
st->cb_state = state;
st->cb = cb;
/*
@@ -1173,6 +1180,31 @@ static struct cpuhp_step cpuhp_bp_states[] = {
.teardown = NULL,
.cant_stop = true,
},
+ [CPUHP_PERF_PREPARE] = {
+ .name = "perf prepare",
+ .startup = perf_event_init_cpu,
+ .teardown = perf_event_exit_cpu,
+ },
+ [CPUHP_WORKQUEUE_PREP] = {
+ .name = "workqueue prepare",
+ .startup = workqueue_prepare_cpu,
+ .teardown = NULL,
+ },
+ [CPUHP_HRTIMERS_PREPARE] = {
+ .name = "hrtimers prepare",
+ .startup = hrtimers_prepare_cpu,
+ .teardown = hrtimers_dead_cpu,
+ },
+ [CPUHP_SMPCFD_PREPARE] = {
+ .name = "SMPCFD prepare",
+ .startup = smpcfd_prepare_cpu,
+ .teardown = smpcfd_dead_cpu,
+ },
+ [CPUHP_RCUTREE_PREP] = {
+ .name = "RCU-tree prepare",
+ .startup = rcutree_prepare_cpu,
+ .teardown = rcutree_dead_cpu,
+ },
/*
* Preparatory and dead notifiers. Will be replaced once the notifiers
* are converted to states.
@@ -1184,6 +1216,16 @@ static struct cpuhp_step cpuhp_bp_states[] = {
.skip_onerr = true,
.cant_stop = true,
},
+ /*
+ * On the tear-down path, timers_dead_cpu() must be invoked
+ * before blk_mq_queue_reinit_notify() from notify_dead(),
+ * otherwise a RCU stall occurs.
+ */
+ [CPUHP_TIMERS_DEAD] = {
+ .name = "timers dead",
+ .startup = NULL,
+ .teardown = timers_dead_cpu,
+ },
/* Kicks the plugged cpu into life */
[CPUHP_BRINGUP_CPU] = {
.name = "cpu:bringup",
@@ -1191,6 +1233,10 @@ static struct cpuhp_step cpuhp_bp_states[] = {
.teardown = NULL,
.cant_stop = true,
},
+ [CPUHP_AP_SMPCFD_DYING] = {
+ .startup = NULL,
+ .teardown = smpcfd_dying_cpu,
+ },
/*
* Handled on controll processor until the plugged processor manages
* this itself.
@@ -1227,6 +1273,10 @@ static struct cpuhp_step cpuhp_ap_states[] = {
.startup = sched_cpu_starting,
.teardown = sched_cpu_dying,
},
+ [CPUHP_AP_RCUTREE_DYING] = {
+ .startup = NULL,
+ .teardown = rcutree_dying_cpu,
+ },
/*
* Low level startup/teardown notifiers. Run with interrupts
* disabled. Will be removed once the notifiers are converted to
@@ -1250,6 +1300,22 @@ static struct cpuhp_step cpuhp_ap_states[] = {
.startup = smpboot_unpark_threads,
.teardown = NULL,
},
+ [CPUHP_AP_PERF_ONLINE] = {
+ .name = "perf online",
+ .startup = perf_event_init_cpu,
+ .teardown = perf_event_exit_cpu,
+ },
+ [CPUHP_AP_WORKQUEUE_ONLINE] = {
+ .name = "workqueue online",
+ .startup = workqueue_online_cpu,
+ .teardown = workqueue_offline_cpu,
+ },
+ [CPUHP_AP_RCUTREE_ONLINE] = {
+ .name = "RCU-tree online",
+ .startup = rcutree_online_cpu,
+ .teardown = rcutree_offline_cpu,
+ },
+
/*
* Online/down_prepare notifiers. Will be removed once the notifiers
* are converted to states.
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 40b6ed559..2b4c20ab5 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -325,8 +325,7 @@ static struct file_system_type cpuset_fs_type = {
/*
* Return in pmask the portion of a cpusets's cpus_allowed that
* are online. If none are online, walk up the cpuset hierarchy
- * until we find one that does have some online cpus. The top
- * cpuset always has some cpus online.
+ * until we find one that does have some online cpus.
*
* One way or another, we guarantee to return some non-empty subset
* of cpu_online_mask.
@@ -335,8 +334,20 @@ static struct file_system_type cpuset_fs_type = {
*/
static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
{
- while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask))
+ while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) {
cs = parent_cs(cs);
+ if (unlikely(!cs)) {
+ /*
+ * The top cpuset doesn't have any online cpu as a
+ * consequence of a race between cpuset_hotplug_work
+ * and cpu hotplug notifier. But we know the top
+ * cpuset's effective_cpus is on its way to be
+ * identical to cpu_online_mask.
+ */
+ cpumask_copy(pmask, cpu_online_mask);
+ return;
+ }
+ }
cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
}
@@ -1034,15 +1045,6 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
{
bool need_loop;
- /*
- * Allow tasks that have access to memory reserves because they have
- * been OOM killed to get memory anywhere.
- */
- if (unlikely(test_thread_flag(TIF_MEMDIE)))
- return;
- if (current->flags & PF_EXITING) /* Let dying task have memory */
- return;
-
task_lock(tsk);
/*
* Determine if a loop is necessary if another thread is doing
@@ -2083,7 +2085,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
* which could have been changed by cpuset just after it inherits the
* state from the parent and before it sits on the cgroup's task list.
*/
-void cpuset_fork(struct task_struct *task)
+static void cpuset_fork(struct task_struct *task)
{
if (task_css_is_root(task, cpuset_cgrp_id))
return;
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 179ef4640..e9fdb5203 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -104,7 +104,7 @@ fail:
return -ENOMEM;
}
-int get_callchain_buffers(void)
+int get_callchain_buffers(int event_max_stack)
{
int err = 0;
int count;
@@ -121,6 +121,15 @@ int get_callchain_buffers(void)
/* If the allocation failed, give up */
if (!callchain_cpus_entries)
err = -ENOMEM;
+ /*
+ * If requesting per event more than the global cap,
+ * return a different error to help userspace figure
+ * this out.
+ *
+ * And also do it here so that we have &callchain_mutex held.
+ */
+ if (event_max_stack > sysctl_perf_event_max_stack)
+ err = -EOVERFLOW;
goto exit;
}
@@ -174,11 +183,12 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
bool user = !event->attr.exclude_callchain_user;
/* Disallow cross-task user callchains. */
bool crosstask = event->ctx->task && event->ctx->task != current;
+ const u32 max_stack = event->attr.sample_max_stack;
if (!kernel && !user)
return NULL;
- return get_perf_callchain(regs, 0, kernel, user, sysctl_perf_event_max_stack, crosstask, true);
+ return get_perf_callchain(regs, 0, kernel, user, max_stack, crosstask, true);
}
struct perf_callchain_entry *
diff --git a/kernel/events/core.c b/kernel/events/core.c
index e68c0a735..fc9bb2225 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -371,6 +371,7 @@ static atomic_t perf_sched_count;
static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
static DEFINE_PER_CPU(int, perf_sched_cb_usages);
+static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
@@ -432,6 +433,13 @@ int perf_proc_update_handler(struct ctl_table *table, int write,
if (ret || !write)
return ret;
+ /*
+ * If throttling is disabled don't allow the write:
+ */
+ if (sysctl_perf_cpu_time_max_percent == 100 ||
+ sysctl_perf_cpu_time_max_percent == 0)
+ return -EINVAL;
+
max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
update_perf_cpu_limits();
@@ -476,7 +484,7 @@ static u64 __report_allowed;
static void perf_duration_warn(struct irq_work *w)
{
- printk_ratelimited(KERN_WARNING
+ printk_ratelimited(KERN_INFO
"perf: interrupt took too long (%lld > %lld), lowering "
"kernel.perf_event_max_sample_rate to %d\n",
__report_avg, __report_allowed,
@@ -871,6 +879,32 @@ perf_cgroup_mark_enabled(struct perf_event *event,
}
}
}
+
+/*
+ * Update cpuctx->cgrp so that it is set when first cgroup event is added and
+ * cleared when last cgroup event is removed.
+ *
+ * @add selects the direction: true accounts @event in ctx->nr_cgroups,
+ * false removes it. Events for which is_cgroup_event() is false are
+ * ignored and leave the counter untouched.
+ */
+static inline void
+list_update_cgroup_event(struct perf_event *event,
+ struct perf_event_context *ctx, bool add)
+{
+ struct perf_cpu_context *cpuctx;
+
+ if (!is_cgroup_event(event))
+ return;
+
+ /*
+ * Post-increment on add: a non-zero old count means this is not the
+ * first cgroup event. Pre-decrement on remove: a non-zero new count
+ * means this was not the last one. In both cases cpuctx->cgrp is
+ * left as it is.
+ */
+ if (add && ctx->nr_cgroups++)
+ return;
+ else if (!add && --ctx->nr_cgroups)
+ return;
+ /*
+ * Because cgroup events are always per-cpu events,
+ * this will always be called from the right CPU.
+ */
+ cpuctx = __get_cpu_context(ctx);
+ cpuctx->cgrp = add ? event->cgrp : NULL;
+}
+
#else /* !CONFIG_CGROUP_PERF */
static inline bool
@@ -948,6 +982,13 @@ perf_cgroup_mark_enabled(struct perf_event *event,
struct perf_event_context *ctx)
{
}
+
+/* No-op counterpart used in the !CONFIG_CGROUP_PERF branch. */
+static inline void
+list_update_cgroup_event(struct perf_event *event,
+ struct perf_event_context *ctx, bool add)
+{
+}
+
#endif
/*
@@ -1420,6 +1461,7 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
static void
list_add_event(struct perf_event *event, struct perf_event_context *ctx)
{
+
lockdep_assert_held(&ctx->lock);
WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
@@ -1440,8 +1482,7 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
list_add_tail(&event->group_entry, list);
}
- if (is_cgroup_event(event))
- ctx->nr_cgroups++;
+ list_update_cgroup_event(event, ctx, true);
list_add_rcu(&event->event_entry, &ctx->event_list);
ctx->nr_events++;
@@ -1609,8 +1650,6 @@ static void perf_group_attach(struct perf_event *event)
static void
list_del_event(struct perf_event *event, struct perf_event_context *ctx)
{
- struct perf_cpu_context *cpuctx;
-
WARN_ON_ONCE(event->ctx != ctx);
lockdep_assert_held(&ctx->lock);
@@ -1622,20 +1661,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
event->attach_state &= ~PERF_ATTACH_CONTEXT;
- if (is_cgroup_event(event)) {
- ctx->nr_cgroups--;
- /*
- * Because cgroup events are always per-cpu events, this will
- * always be called from the right CPU.
- */
- cpuctx = __get_cpu_context(ctx);
- /*
- * If there are no more cgroup events then clear cgrp to avoid
- * stale pointer in update_cgrp_time_from_cpuctx().
- */
- if (!ctx->nr_cgroups)
- cpuctx->cgrp = NULL;
- }
+ list_update_cgroup_event(event, ctx, false);
ctx->nr_events--;
if (event->attr.inherit_stat)
@@ -1744,8 +1770,8 @@ static inline int pmu_filter_match(struct perf_event *event)
static inline int
event_filter_match(struct perf_event *event)
{
- return (event->cpu == -1 || event->cpu == smp_processor_id())
- && perf_cgroup_match(event) && pmu_filter_match(event);
+ return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
+ perf_cgroup_match(event) && pmu_filter_match(event);
}
static void
@@ -1765,8 +1791,8 @@ event_sched_out(struct perf_event *event,
* maintained, otherwise bogus information is return
* via read() for time_enabled, time_running:
*/
- if (event->state == PERF_EVENT_STATE_INACTIVE
- && !event_filter_match(event)) {
+ if (event->state == PERF_EVENT_STATE_INACTIVE &&
+ !event_filter_match(event)) {
delta = tstamp - event->tstamp_stopped;
event->tstamp_running += delta;
event->tstamp_stopped = tstamp;
@@ -2264,10 +2290,15 @@ perf_install_in_context(struct perf_event_context *ctx,
lockdep_assert_held(&ctx->mutex);
- event->ctx = ctx;
if (event->cpu != -1)
event->cpu = cpu;
+ /*
+ * Ensures that if we can observe event->ctx, both the event and ctx
+ * will be 'complete'. See perf_iterate_sb_cpu().
+ */
+ smp_store_release(&event->ctx, ctx);
+
if (!task) {
cpu_function_call(cpu, __perf_install_in_context, event);
return;
@@ -2465,11 +2496,11 @@ static int __perf_event_stop(void *info)
return 0;
}
-static int perf_event_restart(struct perf_event *event)
+static int perf_event_stop(struct perf_event *event, int restart)
{
struct stop_event_data sd = {
.event = event,
- .restart = 1,
+ .restart = restart,
};
int ret = 0;
@@ -3518,8 +3549,17 @@ static int perf_event_read(struct perf_event *event, bool group)
.group = group,
.ret = 0,
};
- smp_call_function_single(event->oncpu,
- __perf_event_read, &data, 1);
+ /*
+ * Purposely ignore the smp_call_function_single() return
+ * value.
+ *
+ * If event->oncpu isn't a valid CPU it means the event got
+ * scheduled out and that will have updated the event count.
+ *
+ * Therefore, either way, we'll have an up-to-date event count
+ * after this.
+ */
+ (void)smp_call_function_single(event->oncpu, __perf_event_read, &data, 1);
ret = data.ret;
} else if (event->state == PERF_EVENT_STATE_INACTIVE) {
struct perf_event_context *ctx = event->ctx;
@@ -3722,6 +3762,39 @@ static void free_event_rcu(struct rcu_head *head)
static void ring_buffer_attach(struct perf_event *event,
struct ring_buffer *rb);
+/*
+ * Unlink @event from the pmu_sb_events list belonging to the CPU given by
+ * event->cpu. The list's raw spinlock is held around the RCU list removal.
+ */
+static void detach_sb_event(struct perf_event *event)
+{
+ struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
+
+ raw_spin_lock(&pel->lock);
+ list_del_rcu(&event->sb_list);
+ raw_spin_unlock(&pel->lock);
+}
+
+/*
+ * Predicate used by unaccount_pmu_sb_event() to decide whether @event has
+ * to be detached from pmu_sb_events.
+ *
+ * Returns false for events that have a parent or carry PERF_ATTACH_TASK.
+ * Otherwise returns true when any of the mmap, mmap_data, mmap2, comm,
+ * comm_exec, task or context_switch attr fields is set.
+ */
+static bool is_sb_event(struct perf_event *event)
+{
+ struct perf_event_attr *attr = &event->attr;
+
+ if (event->parent)
+ return false;
+
+ if (event->attach_state & PERF_ATTACH_TASK)
+ return false;
+
+ if (attr->mmap || attr->mmap_data || attr->mmap2 ||
+ attr->comm || attr->comm_exec ||
+ attr->task ||
+ attr->context_switch)
+ return true;
+ return false;
+}
+
+/* Detach @event from pmu_sb_events, but only when is_sb_event() accepts it. */
+static void unaccount_pmu_sb_event(struct perf_event *event)
+{
+ if (is_sb_event(event))
+ detach_sb_event(event);
+}
+
static void unaccount_event_cpu(struct perf_event *event, int cpu)
{
if (event->parent)
@@ -3785,6 +3858,8 @@ static void unaccount_event(struct perf_event *event)
}
unaccount_event_cpu(event, event->cpu);
+
+ unaccount_pmu_sb_event(event);
}
static void perf_sched_delayed(struct work_struct *work)
@@ -3854,7 +3929,7 @@ static void exclusive_event_destroy(struct perf_event *event)
static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
{
- if ((e1->pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) &&
+ if ((e1->pmu == e2->pmu) &&
(e1->cpu == e2->cpu ||
e1->cpu == -1 ||
e2->cpu == -1))
@@ -4770,6 +4845,19 @@ static void ring_buffer_attach(struct perf_event *event,
spin_unlock_irqrestore(&rb->event_lock, flags);
}
+ /*
+ * Avoid racing with perf_mmap_close(AUX): stop the event
+ * before swizzling the event::rb pointer; if it's getting
+ * unmapped, its aux_mmap_count will be 0 and it won't
+ * restart. See the comment in __perf_pmu_output_stop().
+ *
+ * Data will inevitably be lost when set_output is done in
+ * mid-air, but then again, whoever does it like this is
+ * not in for the data anyway.
+ */
+ if (has_aux(event))
+ perf_event_stop(event, 0);
+
rcu_assign_pointer(event->rb, rb);
if (old_rb) {
@@ -5610,16 +5698,26 @@ void perf_output_sample(struct perf_output_handle *handle,
}
if (sample_type & PERF_SAMPLE_RAW) {
- if (data->raw) {
- u32 raw_size = data->raw->size;
- u32 real_size = round_up(raw_size + sizeof(u32),
- sizeof(u64)) - sizeof(u32);
- u64 zero = 0;
-
- perf_output_put(handle, real_size);
- __output_copy(handle, data->raw->data, raw_size);
- if (real_size - raw_size)
- __output_copy(handle, &zero, real_size - raw_size);
+ struct perf_raw_record *raw = data->raw;
+
+ if (raw) {
+ struct perf_raw_frag *frag = &raw->frag;
+
+ perf_output_put(handle, raw->size);
+ do {
+ if (frag->copy) {
+ __output_custom(handle, frag->copy,
+ frag->data, frag->size);
+ } else {
+ __output_copy(handle, frag->data,
+ frag->size);
+ }
+ if (perf_raw_frag_last(frag))
+ break;
+ frag = frag->next;
+ } while (1);
+ if (frag->pad)
+ __output_skip(handle, NULL, frag->pad);
} else {
struct {
u32 size;
@@ -5744,14 +5842,28 @@ void perf_prepare_sample(struct perf_event_header *header,
}
if (sample_type & PERF_SAMPLE_RAW) {
- int size = sizeof(u32);
-
- if (data->raw)
- size += data->raw->size;
- else
- size += sizeof(u32);
+ struct perf_raw_record *raw = data->raw;
+ int size;
+
+ if (raw) {
+ struct perf_raw_frag *frag = &raw->frag;
+ u32 sum = 0;
+
+ do {
+ sum += frag->size;
+ if (perf_raw_frag_last(frag))
+ break;
+ frag = frag->next;
+ } while (1);
+
+ size = round_up(sum + sizeof(u32), sizeof(u64));
+ raw->size = size - sizeof(u32);
+ frag->pad = raw->size - sum;
+ } else {
+ size = sizeof(u64);
+ }
- header->size += round_up(size, sizeof(u64));
+ header->size += size;
}
if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
@@ -5911,11 +6023,11 @@ perf_event_read_event(struct perf_event *event,
perf_output_end(&handle);
}
-typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data);
+typedef void (perf_iterate_f)(struct perf_event *event, void *data);
static void
-perf_event_aux_ctx(struct perf_event_context *ctx,
- perf_event_aux_output_cb output,
+perf_iterate_ctx(struct perf_event_context *ctx,
+ perf_iterate_f output,
void *data, bool all)
{
struct perf_event *event;
@@ -5932,52 +6044,63 @@ perf_event_aux_ctx(struct perf_event_context *ctx,
}
}
-static void
-perf_event_aux_task_ctx(perf_event_aux_output_cb output, void *data,
- struct perf_event_context *task_ctx)
+static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
{
- rcu_read_lock();
- preempt_disable();
- perf_event_aux_ctx(task_ctx, output, data, false);
- preempt_enable();
- rcu_read_unlock();
+ struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events);
+ struct perf_event *event;
+
+ list_for_each_entry_rcu(event, &pel->list, sb_list) {
+ /*
+ * Skip events that are not fully formed yet; ensure that
+ * if we observe event->ctx, both event and ctx will be
+ * complete enough. See perf_install_in_context().
+ */
+ if (!smp_load_acquire(&event->ctx))
+ continue;
+
+ if (event->state < PERF_EVENT_STATE_INACTIVE)
+ continue;
+ if (!event_filter_match(event))
+ continue;
+ output(event, data);
+ }
}
+/*
+ * Iterate all events that need to receive side-band events.
+ *
+ * For new callers; ensure that account_pmu_sb_event() includes
+ * your event, otherwise it might not get delivered.
+ */
static void
-perf_event_aux(perf_event_aux_output_cb output, void *data,
+perf_iterate_sb(perf_iterate_f output, void *data,
struct perf_event_context *task_ctx)
{
- struct perf_cpu_context *cpuctx;
struct perf_event_context *ctx;
- struct pmu *pmu;
int ctxn;
+ rcu_read_lock();
+ preempt_disable();
+
/*
- * If we have task_ctx != NULL we only notify
- * the task context itself. The task_ctx is set
- * only for EXIT events before releasing task
+ * If we have task_ctx != NULL we only notify the task context itself.
+ * The task_ctx is set only for EXIT events before releasing task
* context.
*/
if (task_ctx) {
- perf_event_aux_task_ctx(output, data, task_ctx);
- return;
+ perf_iterate_ctx(task_ctx, output, data, false);
+ goto done;
}
- rcu_read_lock();
- list_for_each_entry_rcu(pmu, &pmus, entry) {
- cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
- if (cpuctx->unique_pmu != pmu)
- goto next;
- perf_event_aux_ctx(&cpuctx->ctx, output, data, false);
- ctxn = pmu->task_ctx_nr;
- if (ctxn < 0)
- goto next;
+ perf_iterate_sb_cpu(output, data);
+
+ for_each_task_context_nr(ctxn) {
ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
if (ctx)
- perf_event_aux_ctx(ctx, output, data, false);
-next:
- put_cpu_ptr(pmu->pmu_cpu_context);
+ perf_iterate_ctx(ctx, output, data, false);
}
+done:
+ preempt_enable();
rcu_read_unlock();
}
@@ -6010,7 +6133,7 @@ static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
raw_spin_unlock_irqrestore(&ifh->lock, flags);
if (restart)
- perf_event_restart(event);
+ perf_event_stop(event, 1);
}
void perf_event_exec(void)
@@ -6026,7 +6149,7 @@ void perf_event_exec(void)
perf_event_enable_on_exec(ctxn);
- perf_event_aux_ctx(ctx, perf_event_addr_filters_exec, NULL,
+ perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL,
true);
}
rcu_read_unlock();
@@ -6054,7 +6177,13 @@ static void __perf_event_output_stop(struct perf_event *event, void *data)
/*
* In case of inheritance, it will be the parent that links to the
- * ring-buffer, but it will be the child that's actually using it:
+ * ring-buffer, but it will be the child that's actually using it.
+ *
+ * We are using event::rb to determine if the event should be stopped,
+ * however this may race with ring_buffer_attach() (through set_output),
+ * which will make us skip the event that actually needs to be stopped.
+ * So ring_buffer_attach() has to stop an aux event before re-assigning
+ * its rb pointer.
*/
if (rcu_dereference(parent->rb) == rb)
ro->err = __perf_event_stop(&sd);
@@ -6064,15 +6193,15 @@ static int __perf_pmu_output_stop(void *info)
{
struct perf_event *event = info;
struct pmu *pmu = event->pmu;
- struct perf_cpu_context *cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
struct remote_output ro = {
.rb = event->rb,
};
rcu_read_lock();
- perf_event_aux_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);
+ perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);
if (cpuctx->task_ctx)
- perf_event_aux_ctx(cpuctx->task_ctx, __perf_event_output_stop,
+ perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop,
&ro, false);
rcu_read_unlock();
@@ -6201,7 +6330,7 @@ static void perf_event_task(struct task_struct *task,
},
};
- perf_event_aux(perf_event_task_output,
+ perf_iterate_sb(perf_event_task_output,
&task_event,
task_ctx);
}
@@ -6280,7 +6409,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
- perf_event_aux(perf_event_comm_output,
+ perf_iterate_sb(perf_event_comm_output,
comm_event,
NULL);
}
@@ -6511,7 +6640,7 @@ got_name:
mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
- perf_event_aux(perf_event_mmap_output,
+ perf_iterate_sb(perf_event_mmap_output,
mmap_event,
NULL);
@@ -6519,15 +6648,6 @@ got_name:
}
/*
- * Whether this @filter depends on a dynamic object which is not loaded
- * yet or its load addresses are not known.
- */
-static bool perf_addr_filter_needs_mmap(struct perf_addr_filter *filter)
-{
- return filter->filter && filter->inode;
-}
-
-/*
* Check whether inode and address range match filter criteria.
*/
static bool perf_addr_filter_match(struct perf_addr_filter *filter,
@@ -6577,7 +6697,7 @@ static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
raw_spin_unlock_irqrestore(&ifh->lock, flags);
if (restart)
- perf_event_restart(event);
+ perf_event_stop(event, 1);
}
/*
@@ -6588,13 +6708,20 @@ static void perf_addr_filters_adjust(struct vm_area_struct *vma)
struct perf_event_context *ctx;
int ctxn;
+ /*
+ * Data tracing isn't supported yet and as such there is no need
+ * to keep track of anything that isn't related to executable code:
+ */
+ if (!(vma->vm_flags & VM_EXEC))
+ return;
+
rcu_read_lock();
for_each_task_context_nr(ctxn) {
ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
if (!ctx)
continue;
- perf_event_aux_ctx(ctx, __perf_addr_filters_adjust, vma, true);
+ perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true);
}
rcu_read_unlock();
}
@@ -6781,7 +6908,7 @@ static void perf_event_switch(struct task_struct *task,
},
};
- perf_event_aux(perf_event_switch_output,
+ perf_iterate_sb(perf_event_switch_output,
&switch_event,
NULL);
}
@@ -7388,7 +7515,7 @@ static struct pmu perf_swevent = {
static int perf_tp_filter_match(struct perf_event *event,
struct perf_sample_data *data)
{
- void *record = data->raw->data;
+ void *record = data->raw->frag.data;
/* only top level events have filters set */
if (event->parent)
@@ -7444,8 +7571,10 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
struct perf_event *event;
struct perf_raw_record raw = {
- .size = entry_size,
- .data = record,
+ .frag = {
+ .size = entry_size,
+ .data = record,
+ },
};
perf_sample_data_init(&data, 0, 0);
@@ -7586,7 +7715,7 @@ static void perf_event_free_bpf_prog(struct perf_event *event)
prog = event->tp_event->prog;
if (prog) {
event->tp_event->prog = NULL;
- bpf_prog_put_rcu(prog);
+ bpf_prog_put(prog);
}
}
@@ -7738,7 +7867,11 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
list_for_each_entry(filter, &ifh->list, entry) {
event->addr_filters_offs[count] = 0;
- if (perf_addr_filter_needs_mmap(filter))
+ /*
+ * Adjust base offset if the filter is associated with a binary
+ * that needs to be mapped:
+ */
+ if (filter->inode)
event->addr_filters_offs[count] =
perf_addr_filter_apply(filter, mm);
@@ -7753,7 +7886,7 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
mmput(mm);
restart:
- perf_event_restart(event);
+ perf_event_stop(event, 1);
}
/*
@@ -7869,8 +8002,10 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
goto fail;
}
- if (token == IF_SRC_FILE) {
- filename = match_strdup(&args[2]);
+ if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {
+ int fpos = filter->range ? 2 : 1;
+
+ filename = match_strdup(&args[fpos]);
if (!filename) {
ret = -ENOMEM;
goto fail;
@@ -8703,6 +8838,28 @@ unlock:
return pmu;
}
+/* Add @event to the pmu_sb_events list of the CPU given by event->cpu. */
+static void attach_sb_event(struct perf_event *event)
+{
+	struct pmu_event_list *sb = per_cpu_ptr(&pmu_sb_events, event->cpu);
+
+	raw_spin_lock(&sb->lock);
+	list_add_rcu(&event->sb_list, &sb->list);
+	raw_spin_unlock(&sb->lock);
+}
+
+/*
+ * Side-band records must reach every !task (and therefore per-cpu)
+ * event that asked for them. Keeping those events on a dedicated
+ * list avoids scanning all the PMU per-cpu contexts to find them.
+ */
+static void account_pmu_sb_event(struct perf_event *event)
+{
+	if (!is_sb_event(event))
+		return;
+	attach_sb_event(event);
+}
+
static void account_event_cpu(struct perf_event *event, int cpu)
{
if (event->parent)
@@ -8783,6 +8940,8 @@ static void account_event(struct perf_event *event)
enabled:
account_event_cpu(event, event->cpu);
+
+ account_pmu_sb_event(event);
}
/*
@@ -8931,7 +9090,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
if (!event->parent) {
if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
- err = get_callchain_buffers();
+ err = get_callchain_buffers(attr->sample_max_stack);
if (err)
goto err_addr_filters;
}
@@ -9253,6 +9412,9 @@ SYSCALL_DEFINE5(perf_event_open,
return -EINVAL;
}
+ if (!attr.sample_max_stack)
+ attr.sample_max_stack = sysctl_perf_event_max_stack;
+
/*
* In cgroup mode, the pid argument is used to pass the fd
* opened to the cgroup directory in cgroupfs. The cpu argument
@@ -9326,7 +9488,7 @@ SYSCALL_DEFINE5(perf_event_open,
if (is_sampling_event(event)) {
if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
- err = -ENOTSUPP;
+ err = -EOPNOTSUPP;
goto err_alloc;
}
}
@@ -10288,10 +10450,13 @@ static void __init perf_event_init_all_cpus(void)
swhash = &per_cpu(swevent_htable, cpu);
mutex_init(&swhash->hlist_mutex);
INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
+
+ INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
+ raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
}
}
-static void perf_event_init_cpu(int cpu)
+int perf_event_init_cpu(unsigned int cpu)
{
struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
@@ -10304,6 +10469,7 @@ static void perf_event_init_cpu(int cpu)
rcu_assign_pointer(swhash->swevent_hlist, hlist);
}
mutex_unlock(&swhash->hlist_mutex);
+ return 0;
}
#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
@@ -10335,14 +10501,17 @@ static void perf_event_exit_cpu_context(int cpu)
}
srcu_read_unlock(&pmus_srcu, idx);
}
+#else
+
+static void perf_event_exit_cpu_context(int cpu) { }
+
+#endif
-static void perf_event_exit_cpu(int cpu)
+int perf_event_exit_cpu(unsigned int cpu)
{
perf_event_exit_cpu_context(cpu);
+ return 0;
}
-#else
-static inline void perf_event_exit_cpu(int cpu) { }
-#endif
static int
perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
@@ -10364,46 +10533,6 @@ static struct notifier_block perf_reboot_notifier = {
.priority = INT_MIN,
};
-static int
-perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
-{
- unsigned int cpu = (long)hcpu;
-
- switch (action & ~CPU_TASKS_FROZEN) {
-
- case CPU_UP_PREPARE:
- /*
- * This must be done before the CPU comes alive, because the
- * moment we can run tasks we can encounter (software) events.
- *
- * Specifically, someone can have inherited events on kthreadd
- * or a pre-existing worker thread that gets re-bound.
- */
- perf_event_init_cpu(cpu);
- break;
-
- case CPU_DOWN_PREPARE:
- /*
- * This must be done before the CPU dies because after that an
- * active event might want to IPI the CPU and that'll not work
- * so great for dead CPUs.
- *
- * XXX smp_call_function_single() return -ENXIO without a warn
- * so we could possibly deal with this.
- *
- * This is safe against new events arriving because
- * sys_perf_event_open() serializes against hotplug using
- * get_online_cpus().
- */
- perf_event_exit_cpu(cpu);
- break;
- default:
- break;
- }
-
- return NOTIFY_OK;
-}
-
void __init perf_event_init(void)
{
int ret;
@@ -10416,7 +10545,7 @@ void __init perf_event_init(void)
perf_pmu_register(&perf_cpu_clock, NULL, -1);
perf_pmu_register(&perf_task_clock, NULL, -1);
perf_tp_register();
- perf_cpu_notifier(perf_cpu_notify);
+ perf_event_init_cpu(smp_processor_id());
register_reboot_notifier(&perf_reboot_notifier);
ret = init_hw_breakpoint();
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 05f9f6d62..486fd78eb 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -123,21 +123,19 @@ static inline unsigned long perf_aux_size(struct ring_buffer *rb)
return rb->aux_nr_pages << PAGE_SHIFT;
}
-#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \
-static inline unsigned long \
-func_name(struct perf_output_handle *handle, \
- const void *buf, unsigned long len) \
+#define __DEFINE_OUTPUT_COPY_BODY(advance_buf, memcpy_func, ...) \
{ \
unsigned long size, written; \
\
do { \
size = min(handle->size, len); \
- written = memcpy_func(handle->addr, buf, size); \
+ written = memcpy_func(__VA_ARGS__); \
written = size - written; \
\
len -= written; \
handle->addr += written; \
- buf += written; \
+ if (advance_buf) \
+ buf += written; \
handle->size -= written; \
if (!handle->size) { \
struct ring_buffer *rb = handle->rb; \
@@ -152,6 +150,21 @@ func_name(struct perf_output_handle *handle, \
return len; \
}
+#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \
+static inline unsigned long \
+func_name(struct perf_output_handle *handle, \
+ const void *buf, unsigned long len) \
+__DEFINE_OUTPUT_COPY_BODY(true, memcpy_func, handle->addr, buf, size)
+/* buf stays fixed: copy_func receives the offset copied so far instead. */
+static inline unsigned long
+__output_custom(struct perf_output_handle *handle, perf_copy_f copy_func,
+ const void *buf, unsigned long len)
+{
+ unsigned long orig_len = len;
+ __DEFINE_OUTPUT_COPY_BODY(false, copy_func, handle->addr, buf,
+ orig_len - len, size)
+}
+
static inline unsigned long
memcpy_common(void *dst, const void *src, unsigned long n)
{
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index ae9b90dc9..257fa460b 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -330,15 +330,22 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
if (!rb)
return NULL;
- if (!rb_has_aux(rb) || !atomic_inc_not_zero(&rb->aux_refcount))
+ if (!rb_has_aux(rb))
goto err;
/*
- * If rb::aux_mmap_count is zero (and rb_has_aux() above went through),
- * the aux buffer is in perf_mmap_close(), about to get freed.
+ * If aux_mmap_count is zero, the aux buffer is in perf_mmap_close(),
+ * about to get freed, so we leave immediately.
+ *
+ * Checking rb::aux_mmap_count and rb::refcount has to be done in
+ * the same order, see perf_mmap_close. Otherwise we end up freeing
+ * aux pages in this path, which is a bug, because in_atomic().
*/
if (!atomic_read(&rb->aux_mmap_count))
- goto err_put;
+ goto err;
+
+ if (!atomic_inc_not_zero(&rb->aux_refcount))
+ goto err;
/*
* Nesting is not supported for AUX area, make sure nested
diff --git a/kernel/exit.c b/kernel/exit.c
index 10011db7b..c22b37fb8 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -211,6 +211,82 @@ repeat:
}
/*
+ * Note that if this function returns a valid task_struct pointer (!NULL),
+ * task->usage must remain >0 for the duration of the RCU critical section.
+ */
+struct task_struct *task_rcu_dereference(struct task_struct **ptask)
+{
+ struct sighand_struct *sighand;
+ struct task_struct *task;
+
+ /*
+ * We need to verify that release_task() was not called and thus
+ * delayed_put_task_struct() can't run and drop the last reference
+ * before rcu_read_unlock(). We check task->sighand != NULL,
+ * but we may be reading memory that was already freed and reused.
+ */
+retry:
+ task = rcu_dereference(*ptask);
+ if (!task)
+ return NULL;
+
+ probe_kernel_address(&task->sighand, sighand);
+
+ /*
+ * Pairs with atomic_dec_and_test() in put_task_struct(). If this task
+ * was already freed we can not miss the preceding update of this
+ * pointer.
+ */
+ smp_rmb();
+ if (unlikely(task != READ_ONCE(*ptask)))
+ goto retry;
+
+ /*
+ * We've re-checked that "task == *ptask", now we have two different
+ * cases:
+ *
+ * 1. This is actually the same task/task_struct. In this case
+ * sighand != NULL tells us it is still alive.
+ *
+ * 2. This is another task which got the same memory for task_struct.
+ * We can't know this of course, and we can not trust
+ * sighand != NULL.
+ *
+ * In this case we actually return a random value, but this is
+ * correct.
+ *
+ * If we return NULL - we can pretend that we actually noticed that
+ * *ptask was updated when the previous task has exited. Or pretend
+ * that probe_kernel_address(&task->sighand) read NULL.
+ *
+ * If we return the new task (because sighand is not NULL for any
+ * reason) - this is fine too. This (new) task can't go away before
+ * another gp pass.
+ *
+ * And note: We could even eliminate the false positive if we re-read
+ * task->sighand once again to avoid the false NULL. But this case
+ * is very unlikely so we don't care.
+ */
+ if (!sighand)
+ return NULL;
+
+ return task;
+}
+
+struct task_struct *try_get_task_struct(struct task_struct **ptask)
+{
+	struct task_struct *tsk;
+
+	/* Take the reference while still inside the RCU read-side section. */
+	rcu_read_lock();
+	tsk = task_rcu_dereference(ptask);
+	if (tsk != NULL)
+		get_task_struct(tsk);
+	rcu_read_unlock();
+	return tsk;
+}
+
+/*
* Determine if a process group is "orphaned", according to the POSIX
* definition in 2.2.2.52. Orphaned process groups are not to be affected
* by terminal-generated stop signals. Newly orphaned process groups are
@@ -639,7 +715,7 @@ static void check_stack_usage(void)
spin_lock(&low_water_lock);
if (free < lowest_to_date) {
- pr_warn("%s (%d) used greatest stack depth: %lu bytes left\n",
+ pr_info("%s (%d) used greatest stack depth: %lu bytes left\n",
current->comm, task_pid_nr(current), free);
lowest_to_date = free;
}
@@ -700,10 +776,14 @@ void do_exit(long code)
exit_signals(tsk); /* sets PF_EXITING */
/*
- * tsk->flags are checked in the futex code to protect against
- * an exiting task cleaning up the robust pi futexes.
+ * Ensure that all new tsk->pi_lock acquisitions must observe
+ * PF_EXITING. Serializes against futex.c:attach_to_pi_owner().
*/
smp_mb();
+ /*
+ * Ensure that we must observe the pi_state in exit_mm() ->
+ * mm_release() -> exit_pi_state_list().
+ */
raw_spin_unlock_wait(&tsk->pi_lock);
if (unlikely(in_atomic())) {
diff --git a/kernel/fork.c b/kernel/fork.c
index d7485f782..adb2389f7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -139,7 +139,7 @@ static struct kmem_cache *task_struct_cachep;
static inline struct task_struct *alloc_task_struct_node(int node)
{
- return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL | ___GFP_TOI_NOTRACK, node);
+ return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
}
static inline void free_task_struct(struct task_struct *tsk)
@@ -162,23 +162,15 @@ void __weak arch_release_thread_stack(unsigned long *stack)
static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
int node)
{
- struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP,
- THREAD_SIZE_ORDER);
-
- if (page)
- memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK,
- 1 << THREAD_SIZE_ORDER);
+ struct page *page = alloc_pages_node(node, THREADINFO_GFP,
+ THREAD_SIZE_ORDER);
return page ? page_address(page) : NULL;
}
static inline void free_thread_stack(unsigned long *stack)
{
- struct page *page = virt_to_page(stack);
-
- memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK,
- -(1 << THREAD_SIZE_ORDER));
- __free_kmem_pages(page, THREAD_SIZE_ORDER);
+ __free_pages(virt_to_page(stack), THREAD_SIZE_ORDER);
}
# else
static struct kmem_cache *thread_stack_cache;
@@ -223,9 +215,15 @@ static struct kmem_cache *mm_cachep;
static void account_kernel_stack(unsigned long *stack, int account)
{
- struct zone *zone = page_zone(virt_to_page(stack));
+ /* All stack pages are in the same zone and belong to the same memcg. */
+ struct page *first_page = virt_to_page(stack);
- mod_zone_page_state(zone, NR_KERNEL_STACK, account);
+ mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
+ THREAD_SIZE / 1024 * account);
+
+ memcg_kmem_update_page_stat(
+ first_page, MEMCG_KERNEL_STACK_KB,
+ account * (THREAD_SIZE / 1024));
}
void free_task(struct task_struct *tsk)
@@ -459,7 +457,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
goto fail_nomem;
charge = len;
}
- tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+ tmp = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
if (!tmp)
goto fail_nomem;
*tmp = *mpnt;
@@ -512,7 +510,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
__vma_link_rb(mm, tmp, rb_link, rb_parent);
rb_link = &tmp->vm_rb.rb_right;
rb_parent = &tmp->vm_rb;
-
+ uksm_vma_add_new(tmp);
mm->map_count++;
retval = copy_page_range(mm, oldmm, mpnt);
@@ -938,14 +936,12 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
deactivate_mm(tsk, mm);
/*
- * If we're exiting normally, clear a user-space tid field if
- * requested. We leave this alone when dying by signal, to leave
- * the value intact in a core dump, and to save the unnecessary
- * trouble, say, a killed vfork parent shouldn't touch this mm.
- * Userland only wants this done for a sys_exit.
+ * Signal userspace unless we are exiting with a core dump, in
+ * which case we leave the value intact for debugging
+ * purposes.
*/
if (tsk->clear_child_tid) {
- if (!(tsk->flags & PF_SIGNALED) &&
+ if (!(tsk->signal->flags & SIGNAL_GROUP_COREDUMP) &&
atomic_read(&mm->mm_users) > 1) {
/*
* We don't check the error code - if userspace has
diff --git a/kernel/freezer.c b/kernel/freezer.c
index a8900a3bc..6f56a9e21 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -42,7 +42,7 @@ bool freezing_slow_path(struct task_struct *p)
if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK))
return false;
- if (test_thread_flag(TIF_MEMDIE))
+ if (test_tsk_thread_flag(p, TIF_MEMDIE))
return false;
if (pm_nosig_freezing || cgroup_freezing(p))
diff --git a/kernel/futex.c b/kernel/futex.c
index 33664f70e..46cb3a301 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -179,7 +179,15 @@ int __read_mostly futex_cmpxchg_enabled;
* Futex flags used to encode options to functions and preserve them across
* restarts.
*/
-#define FLAGS_SHARED 0x01
+#ifdef CONFIG_MMU
+# define FLAGS_SHARED 0x01
+#else
+/*
+ * NOMMU does not have per process address space. Let the compiler optimize
+ * code away.
+ */
+# define FLAGS_SHARED 0x00
+#endif
#define FLAGS_CLOCKRT 0x02
#define FLAGS_HAS_TIMEOUT 0x04
@@ -405,6 +413,16 @@ static void get_futex_key_refs(union futex_key *key)
if (!key->both.ptr)
return;
+ /*
+ * On MMU less systems futexes are always "private" as there is no per
+ * process address space. We need the smp_mb() nevertheless - yes,
+ * arch/blackfin has MMU less SMP ...
+ */
+ if (!IS_ENABLED(CONFIG_MMU)) {
+ smp_mb(); /* explicit smp_mb(); (B) */
+ return;
+ }
+
switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
case FUT_OFF_INODE:
ihold(key->shared.inode); /* implies smp_mb(); (B) */
@@ -436,6 +454,9 @@ static void drop_futex_key_refs(union futex_key *key)
return;
}
+ if (!IS_ENABLED(CONFIG_MMU))
+ return;
+
switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
case FUT_OFF_INODE:
iput(key->shared.inode);
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 2ee42e95a..1d3ee3169 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -9,3 +9,4 @@ obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o
obj-$(CONFIG_PM_SLEEP) += pm.o
obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o
obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o
+obj-$(CONFIG_SMP) += affinity.o
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
new file mode 100644
index 000000000..32f6cfcff
--- /dev/null
+++ b/kernel/irq/affinity.c
@@ -0,0 +1,63 @@
+
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/cpu.h>
+
+/* First CPU of @cpu's sibling mask, or @cpu itself when none is found. */
+static int get_first_sibling(unsigned int cpu)
+{
+	unsigned int first;
+
+	first = cpumask_first(topology_sibling_cpumask(cpu));
+
+	return first < nr_cpu_ids ? first : cpu;
+}
+
+/*
+ * Take a map of online CPUs and the number of available interrupt vectors
+ * and generate a cpumask for spreading MSI/MSI-X vectors as evenly as
+ * possible around the CPUs. With at least as many vectors as CPUs we map
+ * one to each CPU, otherwise one to the first sibling of each sibling
+ * group until the vectors run out. *nr_vecs returns the number mapped.
+ *
+ * If there are more vectors than CPUs we will still only have one bit
+ * set per CPU, but interrupt code will keep on assigning the vectors from
+ * the start of the bitmap until we run out of vectors.
+ */
+struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs)
+{
+	struct cpumask *affinity_mask;
+	unsigned int max_vecs = *nr_vecs;
+
+	if (max_vecs == 1)
+		return NULL;
+
+	affinity_mask = kzalloc(cpumask_size(), GFP_KERNEL);
+	if (!affinity_mask) {
+		*nr_vecs = 1;
+		return NULL;
+	}
+
+	get_online_cpus();
+	if (max_vecs >= num_online_cpus()) {
+		cpumask_copy(affinity_mask, cpu_online_mask);
+		*nr_vecs = num_online_cpus();
+	} else {
+		unsigned int vecs = 0, cpu;
+
+		for_each_online_cpu(cpu) {
+			if (cpu != get_first_sibling(cpu))
+				continue;
+
+			/* Only a CPU that gets a vector uses up the budget. */
+			cpumask_set_cpu(cpu, affinity_mask);
+			if (++vecs == max_vecs)
+				break;
+		}
+		*nr_vecs = vecs;
+	}
+	put_online_cpus();
+
+	return affinity_mask;
+}
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 2f9f2b0e7..26ba5654d 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -426,6 +426,49 @@ out_unlock:
}
EXPORT_SYMBOL_GPL(handle_simple_irq);
+/**
+ * handle_untracked_irq - Simple and software-decoded IRQs.
+ * @desc: the interrupt description structure for this irq
+ *
+ * Untracked interrupts are sent from a demultiplexing interrupt
+ * handler when the demultiplexer does not know which device in its
+ * multiplexed irq domain generated the interrupt. IRQs handled
+ * through here are not subjected to stats tracking, randomness, or
+ * spurious interrupt detection.
+ *
+ * Note: Like handle_simple_irq, the caller is expected to handle
+ * the ack, clear, mask and unmask issues if necessary.
+ */
+void handle_untracked_irq(struct irq_desc *desc)
+{
+ unsigned int flags = 0;
+
+ raw_spin_lock(&desc->lock);
+
+ if (!irq_may_run(desc))
+ goto out_unlock;
+
+ desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
+
+ if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
+ desc->istate |= IRQS_PENDING;
+ goto out_unlock;
+ }
+
+ desc->istate &= ~IRQS_PENDING;
+ irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
+ raw_spin_unlock(&desc->lock);
+
+ __handle_irq_event_percpu(desc, &flags);
+
+ raw_spin_lock(&desc->lock);
+ irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
+
+out_unlock:
+ raw_spin_unlock(&desc->lock);
+}
+EXPORT_SYMBOL_GPL(handle_untracked_irq);
+
/*
* Called unconditionally from handle_level_irq() and only for oneshot
* interrupts from handle_fasteoi_irq()
@@ -777,6 +820,21 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
desc->name = name;
if (handle != handle_bad_irq && is_chained) {
+ unsigned int type = irqd_get_trigger_type(&desc->irq_data);
+
+ /*
+ * We're about to start this interrupt immediately,
+ * hence the need to set the trigger configuration.
+ * But the .set_type callback may have overridden the
+ * flow handler, ignoring that we're dealing with a
+ * chained interrupt. Reset it immediately because we
+ * do know better.
+ */
+ if (type != IRQ_TYPE_NONE) {
+ __irq_set_trigger(desc, type);
+ desc->handle_irq = handle;
+ }
+
irq_settings_set_noprobe(desc);
irq_settings_set_norequest(desc);
irq_settings_set_nothread(desc);
@@ -1093,3 +1151,43 @@ int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
return 0;
}
+
+/**
+ * irq_chip_pm_get - Enable power for an IRQ chip
+ * @data:	Pointer to interrupt specific data
+ *
+ * Enable the power to the IRQ chip referenced by @data. Returns 0, or the
+ * negative error code reported by pm_runtime_get_sync().
+ */
+int irq_chip_pm_get(struct irq_data *data)
+{
+	int err;
+
+	if (!IS_ENABLED(CONFIG_PM) || !data->chip->parent_device)
+		return 0;
+
+	err = pm_runtime_get_sync(data->chip->parent_device);
+	if (err >= 0)
+		return 0;
+
+	pm_runtime_put_noidle(data->chip->parent_device);
+	return err;
+}
+
+/**
+ * irq_chip_pm_put - Disable power for an IRQ chip
+ * @data:	Pointer to interrupt specific data
+ *
+ * Disable the power to the IRQ chip referenced by @data. Power is only
+ * actually disabled once this function has been called for every IRQ that
+ * called irq_chip_pm_get(). Returns 0, or a negative pm_runtime_put() error.
+ */
+int irq_chip_pm_put(struct irq_data *data)
+{
+	int err;
+
+	if (!IS_ENABLED(CONFIG_PM) || !data->chip->parent_device)
+		return 0;
+	err = pm_runtime_put(data->chip->parent_device);
+	return err < 0 ? err : 0;
+}
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index a15b5485b..d3f249058 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -132,10 +132,10 @@ void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
wake_up_process(action->thread);
}
-irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
+irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags)
{
irqreturn_t retval = IRQ_NONE;
- unsigned int flags = 0, irq = desc->irq_data.irq;
+ unsigned int irq = desc->irq_data.irq;
struct irqaction *action;
for_each_action_of_desc(desc, action) {
@@ -164,7 +164,7 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
/* Fall through to add to randomness */
case IRQ_HANDLED:
- flags |= action->flags;
+ *flags |= action->flags;
break;
default:
@@ -174,7 +174,17 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
retval |= res;
}
- add_interrupt_randomness(irq, flags);
+ return retval;
+}
+
+irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
+{
+ irqreturn_t retval;
+ unsigned int flags = 0;
+
+ retval = __handle_irq_event_percpu(desc, &flags);
+
+ add_interrupt_randomness(desc->irq_data.irq, flags);
if (!noirqdebug)
note_interrupt(desc, retval);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 09be2c903..bc226e783 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -7,6 +7,7 @@
*/
#include <linux/irqdesc.h>
#include <linux/kernel_stat.h>
+#include <linux/pm_runtime.h>
#ifdef CONFIG_SPARSE_IRQ
# define IRQ_BITMAP_BITS (NR_IRQS + 8196)
@@ -83,6 +84,7 @@ extern void irq_mark_irq(unsigned int irq);
extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
+irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags);
irqreturn_t handle_irq_event_percpu(struct irq_desc *desc);
irqreturn_t handle_irq_event(struct irq_desc *desc);
@@ -105,6 +107,8 @@ static inline void unregister_handler_proc(unsigned int irq,
struct irqaction *action) { }
#endif
+extern bool irq_can_set_affinity_usr(unsigned int irq);
+
extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask);
extern void irq_set_thread_affinity(struct irq_desc *desc);
diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c
index 89b49f677..1a9abc1c8 100644
--- a/kernel/irq/ipi.c
+++ b/kernel/irq/ipi.c
@@ -76,14 +76,14 @@ int irq_reserve_ipi(struct irq_domain *domain,
}
}
- virq = irq_domain_alloc_descs(-1, nr_irqs, 0, NUMA_NO_NODE);
+ virq = irq_domain_alloc_descs(-1, nr_irqs, 0, NUMA_NO_NODE, NULL);
if (virq <= 0) {
pr_warn("Can't reserve IPI, failed to alloc descs\n");
return -ENOMEM;
}
virq = __irq_domain_alloc_irqs(domain, virq, nr_irqs, NUMA_NO_NODE,
- (void *) dest, true);
+ (void *) dest, true, NULL);
if (virq <= 0) {
pr_warn("Can't reserve IPI, failed to alloc hw irqs\n");
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 8731e1c5d..a623b44f2 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -68,9 +68,13 @@ static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node)
return 0;
}
-static void desc_smp_init(struct irq_desc *desc, int node)
+static void desc_smp_init(struct irq_desc *desc, int node,
+ const struct cpumask *affinity)
{
- cpumask_copy(desc->irq_common_data.affinity, irq_default_affinity);
+ if (!affinity)
+ affinity = irq_default_affinity;
+ cpumask_copy(desc->irq_common_data.affinity, affinity);
+
#ifdef CONFIG_GENERIC_PENDING_IRQ
cpumask_clear(desc->pending_mask);
#endif
@@ -82,11 +86,12 @@ static void desc_smp_init(struct irq_desc *desc, int node)
#else
static inline int
alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { return 0; }
-static inline void desc_smp_init(struct irq_desc *desc, int node) { }
+static inline void
+desc_smp_init(struct irq_desc *desc, int node, const struct cpumask *affinity) { }
#endif
static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
- struct module *owner)
+ const struct cpumask *affinity, struct module *owner)
{
int cpu;
@@ -107,7 +112,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
desc->owner = owner;
for_each_possible_cpu(cpu)
*per_cpu_ptr(desc->kstat_irqs, cpu) = 0;
- desc_smp_init(desc, node);
+ desc_smp_init(desc, node, affinity);
}
int nr_irqs = NR_IRQS;
@@ -158,7 +163,9 @@ void irq_unlock_sparse(void)
mutex_unlock(&sparse_irq_lock);
}
-static struct irq_desc *alloc_desc(int irq, int node, struct module *owner)
+static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags,
+ const struct cpumask *affinity,
+ struct module *owner)
{
struct irq_desc *desc;
gfp_t gfp = GFP_KERNEL;
@@ -178,7 +185,8 @@ static struct irq_desc *alloc_desc(int irq, int node, struct module *owner)
lockdep_set_class(&desc->lock, &irq_desc_lock_class);
init_rcu_head(&desc->rcu);
- desc_set_defaults(irq, desc, node, owner);
+ desc_set_defaults(irq, desc, node, affinity, owner);
+ irqd_set(&desc->irq_data, flags);
return desc;
@@ -223,13 +231,32 @@ static void free_desc(unsigned int irq)
}
static int alloc_descs(unsigned int start, unsigned int cnt, int node,
- struct module *owner)
+ const struct cpumask *affinity, struct module *owner)
{
+ const struct cpumask *mask = NULL;
struct irq_desc *desc;
- int i;
+ unsigned int flags;
+ int i, cpu = -1;
+
+ if (affinity && cpumask_empty(affinity))
+ return -EINVAL;
+
+ flags = affinity ? IRQD_AFFINITY_MANAGED : 0;
for (i = 0; i < cnt; i++) {
- desc = alloc_desc(start + i, node, owner);
+ if (affinity) {
+ cpu = cpumask_next(cpu, affinity);
+ if (cpu >= nr_cpu_ids)
+ cpu = cpumask_first(affinity);
+ node = cpu_to_node(cpu);
+
+ /*
+ * For single allocations we use the caller provided
+ * mask otherwise we use the mask of the target cpu
+ */
+ mask = cnt == 1 ? affinity : cpumask_of(cpu);
+ }
+ desc = alloc_desc(start + i, node, flags, mask, owner);
if (!desc)
goto err;
mutex_lock(&sparse_irq_lock);
@@ -277,7 +304,7 @@ int __init early_irq_init(void)
nr_irqs = initcnt;
for (i = 0; i < initcnt; i++) {
- desc = alloc_desc(i, node, NULL);
+ desc = alloc_desc(i, node, 0, NULL, NULL);
set_bit(i, allocated_irqs);
irq_insert_desc(i, desc);
}
@@ -311,7 +338,7 @@ int __init early_irq_init(void)
alloc_masks(&desc[i], GFP_KERNEL, node);
raw_spin_lock_init(&desc[i].lock);
lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
- desc_set_defaults(i, &desc[i], node, NULL);
+ desc_set_defaults(i, &desc[i], node, NULL, NULL);
}
return arch_early_irq_init();
}
@@ -328,11 +355,12 @@ static void free_desc(unsigned int irq)
unsigned long flags;
raw_spin_lock_irqsave(&desc->lock, flags);
- desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL);
+ desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL, NULL);
raw_spin_unlock_irqrestore(&desc->lock, flags);
}
static inline int alloc_descs(unsigned int start, unsigned int cnt, int node,
+ const struct cpumask *affinity,
struct module *owner)
{
u32 i;
@@ -453,12 +481,15 @@ EXPORT_SYMBOL_GPL(irq_free_descs);
* @cnt: Number of consecutive irqs to allocate.
* @node: Preferred node on which the irq descriptor should be allocated
* @owner: Owning module (can be NULL)
+ * @affinity: Optional pointer to an affinity mask which hints where the
+ * irq descriptors should be allocated and which default
+ * affinities to use
*
* Returns the first irq number or error code
*/
int __ref
__irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
- struct module *owner)
+ struct module *owner, const struct cpumask *affinity)
{
int start, ret;
@@ -494,7 +525,7 @@ __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
bitmap_set(allocated_irqs, start, cnt);
mutex_unlock(&sparse_irq_lock);
- return alloc_descs(start, cnt, node, owner);
+ return alloc_descs(start, cnt, node, affinity, owner);
err:
mutex_unlock(&sparse_irq_lock);
@@ -512,7 +543,7 @@ EXPORT_SYMBOL_GPL(__irq_alloc_descs);
*/
unsigned int irq_alloc_hwirqs(int cnt, int node)
{
- int i, irq = __irq_alloc_descs(-1, 0, cnt, node, NULL);
+ int i, irq = __irq_alloc_descs(-1, 0, cnt, node, NULL, NULL);
if (irq < 0)
return 0;
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 8798b6c9e..4752b4366 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -481,7 +481,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain,
}
/* Allocate a virtual interrupt number */
- virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node));
+ virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node), NULL);
if (virq <= 0) {
pr_debug("-> virq allocation failed\n");
return 0;
@@ -567,6 +567,7 @@ static void of_phandle_args_to_fwspec(struct of_phandle_args *irq_data,
unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
{
struct irq_domain *domain;
+ struct irq_data *irq_data;
irq_hw_number_t hwirq;
unsigned int type = IRQ_TYPE_NONE;
int virq;
@@ -588,15 +589,46 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
if (irq_domain_translate(domain, fwspec, &hwirq, &type))
return 0;
- if (irq_domain_is_hierarchy(domain)) {
+ /*
+ * WARN if the irqchip returns a type with bits
+ * outside the sense mask set and clear these bits.
+ */
+ if (WARN_ON(type & ~IRQ_TYPE_SENSE_MASK))
+ type &= IRQ_TYPE_SENSE_MASK;
+
+ /*
+ * If we've already configured this interrupt,
+ * don't do it again, or hell will break loose.
+ */
+ virq = irq_find_mapping(domain, hwirq);
+ if (virq) {
+ /*
+ * If the trigger type is not specified or matches the
+ * current trigger type then we are done so return the
+ * interrupt number.
+ */
+ if (type == IRQ_TYPE_NONE || type == irq_get_trigger_type(virq))
+ return virq;
+
/*
- * If we've already configured this interrupt,
- * don't do it again, or hell will break loose.
+ * If the trigger type has not been set yet, then set
+ * it now and return the interrupt number.
*/
- virq = irq_find_mapping(domain, hwirq);
- if (virq)
+ if (irq_get_trigger_type(virq) == IRQ_TYPE_NONE) {
+ irq_data = irq_get_irq_data(virq);
+ if (!irq_data)
+ return 0;
+
+ irqd_set_trigger_type(irq_data, type);
return virq;
+ }
+ pr_warn("type mismatch, failed to map hwirq-%lu for %s!\n",
+ hwirq, of_node_full_name(to_of_node(fwspec->fwnode)));
+ return 0;
+ }
+
+ if (irq_domain_is_hierarchy(domain)) {
virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, fwspec);
if (virq <= 0)
return 0;
@@ -607,10 +639,18 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
return virq;
}
- /* Set type if specified and different than the current one */
- if (type != IRQ_TYPE_NONE &&
- type != irq_get_trigger_type(virq))
- irq_set_irq_type(virq, type);
+ irq_data = irq_get_irq_data(virq);
+ if (!irq_data) {
+ if (irq_domain_is_hierarchy(domain))
+ irq_domain_free_irqs(virq, 1);
+ else
+ irq_dispose_mapping(virq);
+ return 0;
+ }
+
+ /* Store trigger type */
+ irqd_set_trigger_type(irq_data, type);
+
return virq;
}
EXPORT_SYMBOL_GPL(irq_create_fwspec_mapping);
@@ -640,8 +680,12 @@ void irq_dispose_mapping(unsigned int virq)
if (WARN_ON(domain == NULL))
return;
- irq_domain_disassociate(domain, virq);
- irq_free_desc(virq);
+ if (irq_domain_is_hierarchy(domain)) {
+ irq_domain_free_irqs(virq, 1);
+ } else {
+ irq_domain_disassociate(domain, virq);
+ irq_free_desc(virq);
+ }
}
EXPORT_SYMBOL_GPL(irq_dispose_mapping);
@@ -835,19 +879,23 @@ const struct irq_domain_ops irq_domain_simple_ops = {
EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq,
- int node)
+ int node, const struct cpumask *affinity)
{
unsigned int hint;
if (virq >= 0) {
- virq = irq_alloc_descs(virq, virq, cnt, node);
+ virq = __irq_alloc_descs(virq, virq, cnt, node, THIS_MODULE,
+ affinity);
} else {
hint = hwirq % nr_irqs;
if (hint == 0)
hint++;
- virq = irq_alloc_descs_from(hint, cnt, node);
- if (virq <= 0 && hint > 1)
- virq = irq_alloc_descs_from(1, cnt, node);
+ virq = __irq_alloc_descs(-1, hint, cnt, node, THIS_MODULE,
+ affinity);
+ if (virq <= 0 && hint > 1) {
+ virq = __irq_alloc_descs(-1, 1, cnt, node, THIS_MODULE,
+ affinity);
+ }
}
return virq;
@@ -1144,8 +1192,10 @@ int irq_domain_alloc_irqs_recursive(struct irq_domain *domain,
if (recursive)
ret = irq_domain_alloc_irqs_recursive(parent, irq_base,
nr_irqs, arg);
- if (ret >= 0)
- ret = domain->ops->alloc(domain, irq_base, nr_irqs, arg);
+ if (ret < 0)
+ return ret;
+
+ ret = domain->ops->alloc(domain, irq_base, nr_irqs, arg);
if (ret < 0 && recursive)
irq_domain_free_irqs_recursive(parent, irq_base, nr_irqs);
@@ -1160,6 +1210,7 @@ int irq_domain_alloc_irqs_recursive(struct irq_domain *domain,
* @node: NUMA node id for memory allocation
* @arg: domain specific argument
* @realloc: IRQ descriptors have already been allocated if true
+ * @affinity: Optional irq affinity mask for multiqueue devices
*
* Allocate IRQ numbers and initialized all data structures to support
* hierarchy IRQ domains.
@@ -1175,7 +1226,7 @@ int irq_domain_alloc_irqs_recursive(struct irq_domain *domain,
*/
int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
unsigned int nr_irqs, int node, void *arg,
- bool realloc)
+ bool realloc, const struct cpumask *affinity)
{
int i, ret, virq;
@@ -1193,7 +1244,8 @@ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
if (realloc && irq_base >= 0) {
virq = irq_base;
} else {
- virq = irq_domain_alloc_descs(irq_base, nr_irqs, 0, node);
+ virq = irq_domain_alloc_descs(irq_base, nr_irqs, 0, node,
+ affinity);
if (virq < 0) {
pr_debug("cannot allocate IRQ(base %d, count %d)\n",
irq_base, nr_irqs);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index ef0bc02c3..9530fcd27 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -115,12 +115,12 @@ EXPORT_SYMBOL(synchronize_irq);
#ifdef CONFIG_SMP
cpumask_var_t irq_default_affinity;
-static int __irq_can_set_affinity(struct irq_desc *desc)
+static bool __irq_can_set_affinity(struct irq_desc *desc)
{
if (!desc || !irqd_can_balance(&desc->irq_data) ||
!desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity)
- return 0;
- return 1;
+ return false;
+ return true;
}
/**
@@ -134,6 +134,21 @@ int irq_can_set_affinity(unsigned int irq)
}
/**
+ * irq_can_set_affinity_usr - Check if affinity of an irq can be set from user space
+ * @irq: Interrupt to check
+ *
+ * Like irq_can_set_affinity() above, but additionally checks for the
+ * AFFINITY_MANAGED flag.
+ */
+bool irq_can_set_affinity_usr(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ return __irq_can_set_affinity(desc) &&
+ !irqd_affinity_is_managed(&desc->irq_data);
+}
+
+/**
* irq_set_thread_affinity - Notify irq threads to adjust affinity
* @desc: irq descriptor which has affitnity changed
*
@@ -338,10 +353,11 @@ static int setup_affinity(struct irq_desc *desc, struct cpumask *mask)
return 0;
/*
- * Preserve an userspace affinity setup, but make sure that
- * one of the targets is online.
+ * Preserve the managed affinity setting and a userspace affinity
+ * setup, but make sure that one of the targets is online.
*/
- if (irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) {
+ if (irqd_affinity_is_managed(&desc->irq_data) ||
+ irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) {
if (cpumask_intersects(desc->irq_common_data.affinity,
cpu_online_mask))
set = desc->irq_common_data.affinity;
@@ -1117,6 +1133,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
new->irq = irq;
/*
+ * If the trigger type is not specified by the caller,
+ * then use the default for this interrupt.
+ */
+ if (!(new->flags & IRQF_TRIGGER_MASK))
+ new->flags |= irqd_get_trigger_type(&desc->irq_data);
+
+ /*
* Check whether the interrupt nests into another interrupt
* thread.
*/
@@ -1409,10 +1432,18 @@ int setup_irq(unsigned int irq, struct irqaction *act)
if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc)))
return -EINVAL;
+
+ retval = irq_chip_pm_get(&desc->irq_data);
+ if (retval < 0)
+ return retval;
+
chip_bus_lock(desc);
retval = __setup_irq(irq, desc, act);
chip_bus_sync_unlock(desc);
+ if (retval)
+ irq_chip_pm_put(&desc->irq_data);
+
return retval;
}
EXPORT_SYMBOL_GPL(setup_irq);
@@ -1506,6 +1537,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
}
}
+ irq_chip_pm_put(&desc->irq_data);
module_put(desc->owner);
kfree(action->secondary);
return action;
@@ -1648,11 +1680,18 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
action->name = devname;
action->dev_id = dev_id;
+ retval = irq_chip_pm_get(&desc->irq_data);
+ if (retval < 0) {
+ kfree(action);
+ return retval;
+ }
+
chip_bus_lock(desc);
retval = __setup_irq(irq, desc, action);
chip_bus_sync_unlock(desc);
if (retval) {
+ irq_chip_pm_put(&desc->irq_data);
kfree(action->secondary);
kfree(action);
}
@@ -1730,7 +1769,14 @@ void enable_percpu_irq(unsigned int irq, unsigned int type)
if (!desc)
return;
+ /*
+ * If the trigger type is not specified by the caller, then
+ * use the default for this interrupt.
+ */
type &= IRQ_TYPE_SENSE_MASK;
+ if (type == IRQ_TYPE_NONE)
+ type = irqd_get_trigger_type(&desc->irq_data);
+
if (type != IRQ_TYPE_NONE) {
int ret;
@@ -1822,6 +1868,7 @@ static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_
unregister_handler_proc(irq, action);
+ irq_chip_pm_put(&desc->irq_data);
module_put(desc->owner);
return action;
@@ -1884,10 +1931,18 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act)
if (!desc || !irq_settings_is_per_cpu_devid(desc))
return -EINVAL;
+
+ retval = irq_chip_pm_get(&desc->irq_data);
+ if (retval < 0)
+ return retval;
+
chip_bus_lock(desc);
retval = __setup_irq(irq, desc, act);
chip_bus_sync_unlock(desc);
+ if (retval)
+ irq_chip_pm_put(&desc->irq_data);
+
return retval;
}
@@ -1931,12 +1986,20 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler,
action->name = devname;
action->percpu_dev_id = dev_id;
+ retval = irq_chip_pm_get(&desc->irq_data);
+ if (retval < 0) {
+ kfree(action);
+ return retval;
+ }
+
chip_bus_lock(desc);
retval = __setup_irq(irq, desc, action);
chip_bus_sync_unlock(desc);
- if (retval)
+ if (retval) {
+ irq_chip_pm_put(&desc->irq_data);
kfree(action);
+ }
return retval;
}
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 6143b2f64..19e9dfbe9 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -334,7 +334,8 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
ops->set_desc(&arg, desc);
virq = __irq_domain_alloc_irqs(domain, -1, desc->nvec_used,
- dev_to_node(dev), &arg, false);
+ dev_to_node(dev), &arg, false,
+ desc->affinity);
if (virq < 0) {
ret = -ENOSPC;
if (ops->handle_error)
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 4e1b94726..feaa813b8 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -96,7 +96,7 @@ static ssize_t write_irq_affinity(int type, struct file *file,
cpumask_var_t new_value;
int err;
- if (!irq_can_set_affinity(irq) || no_irq_affinity)
+ if (!irq_can_set_affinity_usr(irq) || no_irq_affinity)
return -EIO;
if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
@@ -311,7 +311,6 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)
!name_unique(irq, action))
return;
- memset(name, 0, MAX_NAMELEN);
snprintf(name, MAX_NAMELEN, "%s", action->name);
/* create /proc/irq/1234/handler/ */
@@ -340,7 +339,6 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
if (desc->dir)
goto out_unlock;
- memset(name, 0, MAX_NAMELEN);
sprintf(name, "%d", irq);
/* create /proc/irq/1234 */
@@ -386,7 +384,6 @@ void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
#endif
remove_proc_entry("spurious", desc->dir);
- memset(name, 0, MAX_NAMELEN);
sprintf(name, "%u", irq);
remove_proc_entry(name, root_irq_dir);
}
@@ -421,12 +418,8 @@ void init_irq_proc(void)
/*
* Create entries for all existing IRQs.
*/
- for_each_irq_desc(irq, desc) {
- if (!desc)
- continue;
-
+ for_each_irq_desc(irq, desc)
register_irq_proc(irq, desc);
- }
}
#ifdef CONFIG_GENERIC_IRQ_SHOW
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 4b353e0be..93ad6c1fb 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -14,6 +14,7 @@
#include <linux/err.h>
#include <linux/static_key.h>
#include <linux/jump_label_ratelimit.h>
+#include <linux/bug.h>
#ifdef HAVE_JUMP_LABEL
@@ -56,6 +57,49 @@ jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop)
static void jump_label_update(struct static_key *key);
+/*
+ * There are similar definitions for the !HAVE_JUMP_LABEL case in jump_label.h.
+ * The use of 'atomic_read()' requires atomic.h and it's problematic for some
+ * kernel headers such as kernel.h and others. Since static_key_count() is not
+ * used in the branch statements as it is for the !HAVE_JUMP_LABEL case it's ok
+ * to have it be a function here. Similarly, for 'static_key_enable()' and
+ * 'static_key_disable()', which require bug.h. This should allow jump_label.h
+ * to be included from most/all places for HAVE_JUMP_LABEL.
+ */
+int static_key_count(struct static_key *key)
+{
+ /*
+ * -1 means the first static_key_slow_inc() is in progress.
+ * static_key_enabled() must return true, so return 1 here.
+ */
+ int n = atomic_read(&key->enabled);
+
+ return n >= 0 ? n : 1;
+}
+EXPORT_SYMBOL_GPL(static_key_count);
+
+void static_key_enable(struct static_key *key)
+{
+ int count = static_key_count(key);
+
+ WARN_ON_ONCE(count < 0 || count > 1);
+
+ if (!count)
+ static_key_slow_inc(key);
+}
+EXPORT_SYMBOL_GPL(static_key_enable);
+
+void static_key_disable(struct static_key *key)
+{
+ int count = static_key_count(key);
+
+ WARN_ON_ONCE(count < 0 || count > 1);
+
+ if (count)
+ static_key_slow_dec(key);
+}
+EXPORT_SYMBOL_GPL(static_key_disable);
+
void static_key_slow_inc(struct static_key *key)
{
int v, v1;
@@ -235,6 +279,18 @@ void __init jump_label_init(void)
struct static_key *key = NULL;
struct jump_entry *iter;
+ /*
+ * Since we are initializing the static_key.enabled field with
+ * the 'raw' int values (to avoid pulling in atomic.h) in
+ * jump_label.h, let's make sure that is safe. There are only two
+ * cases to check since we initialize to 0 or 1.
+ */
+ BUILD_BUG_ON((int)ATOMIC_INIT(0) != 0);
+ BUILD_BUG_ON((int)ATOMIC_INIT(1) != 1);
+
+ if (static_key_initialized)
+ return;
+
jump_label_lock();
jump_label_sort_entries(iter_start, iter_stop);
@@ -284,11 +340,14 @@ static int __jump_label_mod_text_reserved(void *start, void *end)
{
struct module *mod;
+ preempt_disable();
mod = __module_text_address((unsigned long)start);
+ WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod);
+ preempt_enable();
+
if (!mod)
return 0;
- WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod);
return __jump_label_text_reserved(mod->jump_entries,
mod->jump_entries + mod->num_jump_entries,
@@ -452,7 +511,7 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val,
return notifier_from_errno(ret);
}
-struct notifier_block jump_label_module_nb = {
+static struct notifier_block jump_label_module_nb = {
.notifier_call = jump_label_module_notify,
.priority = 1, /* higher than tracepoints */
};
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 4384672d3..980936a90 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -48,7 +48,8 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
if (kexec_on_panic) {
/* Verify we have a valid entry point */
- if ((entry < crashk_res.start) || (entry > crashk_res.end))
+ if ((entry < phys_to_boot_phys(crashk_res.start)) ||
+ (entry > phys_to_boot_phys(crashk_res.end)))
return -EADDRNOTAVAIL;
}
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 56b3ed092..561675589 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -95,6 +95,12 @@ int kexec_should_crash(struct task_struct *p)
return 0;
}
+int kexec_crash_loaded(void)
+{
+ return !!kexec_crash_image;
+}
+EXPORT_SYMBOL_GPL(kexec_crash_loaded);
+
/*
* When kexec transitions to the new kernel there is a one-to-one
* mapping between physical and virtual addresses. On processors
@@ -140,6 +146,7 @@ int kexec_should_crash(struct task_struct *p)
* allocating pages whose destination address we do not care about.
*/
#define KIMAGE_NO_DEST (-1UL)
+#define PAGE_COUNT(x) (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
static struct page *kimage_alloc_page(struct kimage *image,
gfp_t gfp_mask,
@@ -147,8 +154,9 @@ static struct page *kimage_alloc_page(struct kimage *image,
int sanity_check_segment_list(struct kimage *image)
{
- int result, i;
+ int i;
unsigned long nr_segments = image->nr_segments;
+ unsigned long total_pages = 0;
/*
* Verify we have good destination addresses. The caller is
@@ -163,16 +171,17 @@ int sanity_check_segment_list(struct kimage *image)
* simply because addresses are changed to page size
* granularity.
*/
- result = -EADDRNOTAVAIL;
for (i = 0; i < nr_segments; i++) {
unsigned long mstart, mend;
mstart = image->segment[i].mem;
mend = mstart + image->segment[i].memsz;
+ if (mstart > mend)
+ return -EADDRNOTAVAIL;
if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
- return result;
+ return -EADDRNOTAVAIL;
if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
- return result;
+ return -EADDRNOTAVAIL;
}
/* Verify our destination addresses do not overlap.
@@ -180,7 +189,6 @@ int sanity_check_segment_list(struct kimage *image)
* through very weird things can happen with no
* easy explanation as one segment stops on another.
*/
- result = -EINVAL;
for (i = 0; i < nr_segments; i++) {
unsigned long mstart, mend;
unsigned long j;
@@ -194,7 +202,7 @@ int sanity_check_segment_list(struct kimage *image)
pend = pstart + image->segment[j].memsz;
/* Do the segments overlap ? */
if ((mend > pstart) && (mstart < pend))
- return result;
+ return -EINVAL;
}
}
@@ -203,12 +211,26 @@ int sanity_check_segment_list(struct kimage *image)
* and it is easier to check up front than to be surprised
* later on.
*/
- result = -EINVAL;
for (i = 0; i < nr_segments; i++) {
if (image->segment[i].bufsz > image->segment[i].memsz)
- return result;
+ return -EINVAL;
+ }
+
+ /*
+ * Verify that no more than half of memory will be consumed. If the
+ * request from userspace is too large, a large amount of time will be
+ * wasted allocating pages, which can cause a soft lockup.
+ */
+ for (i = 0; i < nr_segments; i++) {
+ if (PAGE_COUNT(image->segment[i].memsz) > totalram_pages / 2)
+ return -EINVAL;
+
+ total_pages += PAGE_COUNT(image->segment[i].memsz);
}
+ if (total_pages > totalram_pages / 2)
+ return -EINVAL;
+
/*
* Verify we have good destination addresses. Normally
* the caller is responsible for making certain we don't
@@ -220,16 +242,15 @@ int sanity_check_segment_list(struct kimage *image)
*/
if (image->type == KEXEC_TYPE_CRASH) {
- result = -EADDRNOTAVAIL;
for (i = 0; i < nr_segments; i++) {
unsigned long mstart, mend;
mstart = image->segment[i].mem;
mend = mstart + image->segment[i].memsz - 1;
/* Ensure we are within the crash kernel limits */
- if ((mstart < crashk_res.start) ||
- (mend > crashk_res.end))
- return result;
+ if ((mstart < phys_to_boot_phys(crashk_res.start)) ||
+ (mend > phys_to_boot_phys(crashk_res.end)))
+ return -EADDRNOTAVAIL;
}
}
@@ -352,7 +373,7 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order);
if (!pages)
break;
- pfn = page_to_pfn(pages);
+ pfn = page_to_boot_pfn(pages);
epfn = pfn + count;
addr = pfn << PAGE_SHIFT;
eaddr = epfn << PAGE_SHIFT;
@@ -478,7 +499,7 @@ static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
return -ENOMEM;
ind_page = page_address(page);
- *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
+ *image->entry = virt_to_boot_phys(ind_page) | IND_INDIRECTION;
image->entry = ind_page;
image->last_entry = ind_page +
((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
@@ -533,13 +554,13 @@ void kimage_terminate(struct kimage *image)
#define for_each_kimage_entry(image, ptr, entry) \
for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
ptr = (entry & IND_INDIRECTION) ? \
- phys_to_virt((entry & PAGE_MASK)) : ptr + 1)
+ boot_phys_to_virt((entry & PAGE_MASK)) : ptr + 1)
static void kimage_free_entry(kimage_entry_t entry)
{
struct page *page;
- page = pfn_to_page(entry >> PAGE_SHIFT);
+ page = boot_pfn_to_page(entry >> PAGE_SHIFT);
kimage_free_pages(page);
}
@@ -633,7 +654,7 @@ static struct page *kimage_alloc_page(struct kimage *image,
* have a match.
*/
list_for_each_entry(page, &image->dest_pages, lru) {
- addr = page_to_pfn(page) << PAGE_SHIFT;
+ addr = page_to_boot_pfn(page) << PAGE_SHIFT;
if (addr == destination) {
list_del(&page->lru);
return page;
@@ -648,12 +669,12 @@ static struct page *kimage_alloc_page(struct kimage *image,
if (!page)
return NULL;
/* If the page cannot be used file it away */
- if (page_to_pfn(page) >
+ if (page_to_boot_pfn(page) >
(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
list_add(&page->lru, &image->unusable_pages);
continue;
}
- addr = page_to_pfn(page) << PAGE_SHIFT;
+ addr = page_to_boot_pfn(page) << PAGE_SHIFT;
/* If it is the destination page we want use it */
if (addr == destination)
@@ -676,7 +697,7 @@ static struct page *kimage_alloc_page(struct kimage *image,
struct page *old_page;
old_addr = *old & PAGE_MASK;
- old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
+ old_page = boot_pfn_to_page(old_addr >> PAGE_SHIFT);
copy_highpage(page, old_page);
*old = addr | (*old & ~PAGE_MASK);
@@ -732,7 +753,7 @@ static int kimage_load_normal_segment(struct kimage *image,
result = -ENOMEM;
goto out;
}
- result = kimage_add_page(image, page_to_pfn(page)
+ result = kimage_add_page(image, page_to_boot_pfn(page)
<< PAGE_SHIFT);
if (result < 0)
goto out;
@@ -793,7 +814,7 @@ static int kimage_load_crash_segment(struct kimage *image,
char *ptr;
size_t uchunk, mchunk;
- page = pfn_to_page(maddr >> PAGE_SHIFT);
+ page = boot_pfn_to_page(maddr >> PAGE_SHIFT);
if (!page) {
result = -ENOMEM;
goto out;
@@ -921,7 +942,7 @@ void __weak crash_free_reserved_phys_range(unsigned long begin,
unsigned long addr;
for (addr = begin; addr < end; addr += PAGE_SIZE)
- free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
+ free_reserved_page(boot_pfn_to_page(addr >> PAGE_SHIFT));
}
int crash_shrink_memory(unsigned long new_size)
@@ -1374,7 +1395,7 @@ void vmcoreinfo_append_str(const char *fmt, ...)
void __weak arch_crash_save_vmcoreinfo(void)
{}
-unsigned long __weak paddr_vmcoreinfo_note(void)
+phys_addr_t __weak paddr_vmcoreinfo_note(void)
{
return __pa((unsigned long)(char *)&vmcoreinfo_note);
}
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 152da4a48..ee1bc1bb8 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -101,7 +101,7 @@ KERNEL_ATTR_RO(kexec_loaded);
static ssize_t kexec_crash_loaded_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%d\n", !!kexec_crash_image);
+ return sprintf(buf, "%d\n", kexec_crash_loaded());
}
KERNEL_ATTR_RO(kexec_crash_loaded);
@@ -128,8 +128,8 @@ KERNEL_ATTR_RW(kexec_crash_size);
static ssize_t vmcoreinfo_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%lx %x\n",
- paddr_vmcoreinfo_note(),
+ phys_addr_t vmcore_base = paddr_vmcoreinfo_note();
+ return sprintf(buf, "%pa %x\n", &vmcore_base,
(unsigned int)sizeof(vmcoreinfo_note));
}
KERNEL_ATTR_RO(vmcoreinfo);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 12d8a8f88..9ff173dca 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -275,7 +275,7 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
DECLARE_COMPLETION_ONSTACK(done);
struct task_struct *task;
struct kthread_create_info *create = kmalloc(sizeof(*create),
- GFP_KERNEL | ___GFP_TOI_NOTRACK);
+ GFP_KERNEL);
if (!create)
return ERR_PTR(-ENOMEM);
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 5c2bc1052..8bbe50704 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -309,7 +309,7 @@ static int klp_write_object_relocations(struct module *pmod,
break;
}
- module_enable_ro(pmod);
+ module_enable_ro(pmod, true);
return ret;
}
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 81f1a7107..589d763a4 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -46,6 +46,7 @@
#include <linux/gfp.h>
#include <linux/kmemcheck.h>
#include <linux/random.h>
+#include <linux/jhash.h>
#include <asm/sections.h>
@@ -309,10 +310,14 @@ static struct hlist_head chainhash_table[CHAINHASH_SIZE];
* It's a 64-bit hash, because it's important for the keys to be
* unique.
*/
-#define iterate_chain_key(key1, key2) \
- (((key1) << MAX_LOCKDEP_KEYS_BITS) ^ \
- ((key1) >> (64-MAX_LOCKDEP_KEYS_BITS)) ^ \
- (key2))
+static inline u64 iterate_chain_key(u64 key, u32 idx)
+{
+ u32 k0 = key, k1 = key >> 32;
+
+ __jhash_mix(idx, k0, k1); /* Macro that modifies arguments! */
+
+ return k0 | (u64)k1 << 32;
+}
void lockdep_off(void)
{
diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h
index d06ae3bb4..57a871ae3 100644
--- a/kernel/locking/mutex-debug.h
+++ b/kernel/locking/mutex-debug.h
@@ -29,12 +29,12 @@ extern void debug_mutex_init(struct mutex *lock, const char *name,
static inline void mutex_set_owner(struct mutex *lock)
{
- lock->owner = current;
+ WRITE_ONCE(lock->owner, current);
}
static inline void mutex_clear_owner(struct mutex *lock)
{
- lock->owner = NULL;
+ WRITE_ONCE(lock->owner, NULL);
}
#define spin_lock_mutex(lock, flags) \
diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h
index a68bae5e8..6cd6b8e9e 100644
--- a/kernel/locking/mutex.h
+++ b/kernel/locking/mutex.h
@@ -17,14 +17,20 @@
__list_del((waiter)->list.prev, (waiter)->list.next)
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
+/*
+ * The mutex owner can get read and written to locklessly.
+ * We should use WRITE_ONCE when writing the owner value to
+ * avoid store tearing, otherwise, a thread could potentially
+ * read a partially written and incomplete owner value.
+ */
static inline void mutex_set_owner(struct mutex *lock)
{
- lock->owner = current;
+ WRITE_ONCE(lock->owner, current);
}
static inline void mutex_clear_owner(struct mutex *lock)
{
- lock->owner = NULL;
+ WRITE_ONCE(lock->owner, NULL);
}
#else
static inline void mutex_set_owner(struct mutex *lock)
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index fec082338..19248ddf3 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -93,7 +93,7 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
* that accesses can't leak upwards out of our subsequent critical
* section in the case that the lock is currently held for write.
*/
- cnts = atomic_add_return_acquire(_QR_BIAS, &lock->cnts) - _QR_BIAS;
+ cnts = atomic_fetch_add_acquire(_QR_BIAS, &lock->cnts);
rspin_until_writer_unlock(lock, cnts);
/*
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 5fc8c311b..b2caec731 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -90,7 +90,7 @@ static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[MAX_NODES]);
* therefore increment the cpu number by one.
*/
-static inline u32 encode_tail(int cpu, int idx)
+static inline __pure u32 encode_tail(int cpu, int idx)
{
u32 tail;
@@ -103,7 +103,7 @@ static inline u32 encode_tail(int cpu, int idx)
return tail;
}
-static inline struct mcs_spinlock *decode_tail(u32 tail)
+static inline __pure struct mcs_spinlock *decode_tail(u32 tail)
{
int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;
@@ -268,6 +268,63 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock,
#endif
/*
+ * Various notes on spin_is_locked() and spin_unlock_wait(), which are
+ * 'interesting' functions:
+ *
+ * PROBLEM: some architectures have an interesting issue with atomic ACQUIRE
+ * operations in that the ACQUIRE applies to the LOAD _not_ the STORE (ARM64,
+ * PPC). Also qspinlock has a similar issue per construction, the setting of
+ * the locked byte can be unordered with respect to acquiring the lock proper.
+ *
+ * This gets to be 'interesting' in the following cases, where the /should/s
+ * end up false because of this issue.
+ *
+ *
+ * CASE 1:
+ *
+ * So the spin_is_locked() correctness issue comes from something like:
+ *
+ * CPU0 CPU1
+ *
+ * global_lock(); local_lock(i)
+ * spin_lock(&G) spin_lock(&L[i])
+ * for (i) if (!spin_is_locked(&G)) {
+ * spin_unlock_wait(&L[i]); smp_acquire__after_ctrl_dep();
+ * return;
+ * }
+ * // deal with fail
+ *
+ * Where it is important CPU1 sees G locked or CPU0 sees L[i] locked such
+ * that there is exclusion between the two critical sections.
+ *
+ * The load from spin_is_locked(&G) /should/ be constrained by the ACQUIRE from
+ * spin_lock(&L[i]), and similarly the load(s) from spin_unlock_wait(&L[i])
+ * /should/ be constrained by the ACQUIRE from spin_lock(&G).
+ *
+ * Similarly, later stuff is constrained by the ACQUIRE from CTRL+RMB.
+ *
+ *
+ * CASE 2:
+ *
+ * For spin_unlock_wait() there is a second correctness issue, namely:
+ *
+ * CPU0 CPU1
+ *
+ * flag = set;
+ * smp_mb(); spin_lock(&l)
+ * spin_unlock_wait(&l); if (!flag)
+ * // add to lockless list
+ * spin_unlock(&l);
+ * // iterate lockless list
+ *
+ * Which wants to ensure that CPU1 will stop adding bits to the list and CPU0
+ * will observe the last entry on the list (if spin_unlock_wait() had ACQUIRE
+ * semantics etc..)
+ *
+ * Where flag /should/ be ordered against the locked store of l.
+ */
+
+/*
* queued_spin_lock_slowpath() can (load-)ACQUIRE the lock before
* issuing an _unordered_ store to set _Q_LOCKED_VAL.
*
@@ -322,7 +379,7 @@ void queued_spin_unlock_wait(struct qspinlock *lock)
cpu_relax();
done:
- smp_rmb(); /* CTRL + RMB -> ACQUIRE */
+ smp_acquire__after_ctrl_dep();
}
EXPORT_SYMBOL(queued_spin_unlock_wait);
#endif
@@ -418,7 +475,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
* sequentiality; this is because not all clear_pending_set_locked()
* implementations imply full barriers.
*/
- smp_cond_acquire(!(atomic_read(&lock->val) & _Q_LOCKED_MASK));
+ smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_MASK));
/*
* take ownership and clear the pending bit.
@@ -455,6 +512,8 @@ queue:
* pending stuff.
*
* p,*,* -> n,*,*
+ *
+ * RELEASE, such that the stores to @node must be complete.
*/
old = xchg_tail(lock, tail);
next = NULL;
@@ -465,6 +524,15 @@ queue:
*/
if (old & _Q_TAIL_MASK) {
prev = decode_tail(old);
+ /*
+ * The above xchg_tail() is also a load of @lock which generates,
+ * through decode_tail(), a pointer.
+ *
+ * The address dependency matches the RELEASE of xchg_tail()
+ * such that the access to @prev must happen after.
+ */
+ smp_read_barrier_depends();
+
WRITE_ONCE(prev->next, node);
pv_wait_node(node, prev);
@@ -494,7 +562,7 @@ queue:
*
* The PV pv_wait_head_or_lock function, if active, will acquire
* the lock and return a non-zero value. So we have to skip the
- * smp_cond_acquire() call. As the next PV queue head hasn't been
+ * smp_cond_load_acquire() call. As the next PV queue head hasn't been
* designated yet, there is no way for the locked value to become
* _Q_SLOW_VAL. So both the set_locked() and the
* atomic_cmpxchg_relaxed() calls will be safe.
@@ -505,7 +573,7 @@ queue:
if ((val = pv_wait_head_or_lock(lock, node)))
goto locked;
- smp_cond_acquire(!((val = atomic_read(&lock->val)) & _Q_LOCKED_PENDING_MASK));
+ val = smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_PENDING_MASK));
locked:
/*
@@ -525,9 +593,9 @@ locked:
break;
}
/*
- * The smp_cond_acquire() call above has provided the necessary
- * acquire semantics required for locking. At most two
- * iterations of this loop may be ran.
+ * The smp_cond_load_acquire() call above has provided the
+ * necessary acquire semantics required for locking. At most
+ * two iterations of this loop may be run.
*/
old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL);
if (old == val)
@@ -551,7 +619,7 @@ release:
/*
* release the node
*/
- this_cpu_dec(mcs_nodes[0].count);
+ __this_cpu_dec(mcs_nodes[0].count);
}
EXPORT_SYMBOL(queued_spin_lock_slowpath);
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 21ede57f6..8a99abf58 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -112,12 +112,12 @@ static __always_inline int trylock_clear_pending(struct qspinlock *lock)
#else /* _Q_PENDING_BITS == 8 */
static __always_inline void set_pending(struct qspinlock *lock)
{
- atomic_set_mask(_Q_PENDING_VAL, &lock->val);
+ atomic_or(_Q_PENDING_VAL, &lock->val);
}
static __always_inline void clear_pending(struct qspinlock *lock)
{
- atomic_clear_mask(_Q_PENDING_VAL, &lock->val);
+ atomic_andnot(_Q_PENDING_VAL, &lock->val);
}
static __always_inline int trylock_clear_pending(struct qspinlock *lock)
@@ -450,7 +450,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
goto gotlock;
}
}
- WRITE_ONCE(pn->state, vcpu_halted);
+ WRITE_ONCE(pn->state, vcpu_hashed);
qstat_inc(qstat_pv_wait_head, true);
qstat_inc(qstat_pv_wait_again, waitcnt);
pv_wait(&l->locked, _Q_SLOW_VAL);
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
index 22e025309..b9d031516 100644
--- a/kernel/locking/qspinlock_stat.h
+++ b/kernel/locking/qspinlock_stat.h
@@ -153,7 +153,6 @@ static ssize_t qstat_read(struct file *file, char __user *user_buf,
*/
if ((counter == qstat_pv_latency_kick) ||
(counter == qstat_pv_latency_wake)) {
- stat = 0;
if (kicks)
stat = DIV_ROUND_CLOSEST_ULL(stat, kicks);
}
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 3e746607a..1ec0f4896 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1478,7 +1478,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
*/
int __sched rt_mutex_trylock(struct rt_mutex *lock)
{
- if (WARN_ON(in_irq() || in_nmi() || in_serving_softirq()))
+ if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq()))
return 0;
return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 09e30c622..447e08de1 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -80,7 +80,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
debug_check_no_locks_freed((void *)sem, sizeof(*sem));
lockdep_init_map(&sem->dep_map, name, key, 0);
#endif
- sem->count = RWSEM_UNLOCKED_VALUE;
+ atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE);
raw_spin_lock_init(&sem->wait_lock);
INIT_LIST_HEAD(&sem->wait_list);
#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
@@ -114,12 +114,16 @@ enum rwsem_wake_type {
* - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed)
* - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so)
* - there must be someone on the queue
- * - the spinlock must be held by the caller
+ * - the wait_lock must be held by the caller
+ * - tasks are marked for wakeup, the caller must later invoke wake_up_q()
+ * to actually wakeup the blocked task(s) and drop the reference count,
+ * preferably when the wait_lock is released
* - woken process blocks are discarded from the list after having task zeroed
- * - writers are only woken if downgrading is false
+ * - writers are only marked woken if downgrading is false
*/
static struct rw_semaphore *
-__rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
+__rwsem_mark_wake(struct rw_semaphore *sem,
+ enum rwsem_wake_type wake_type, struct wake_q_head *wake_q)
{
struct rwsem_waiter *waiter;
struct task_struct *tsk;
@@ -128,13 +132,16 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
- if (wake_type == RWSEM_WAKE_ANY)
- /* Wake writer at the front of the queue, but do not
- * grant it the lock yet as we want other writers
- * to be able to steal it. Readers, on the other hand,
- * will block as they will notice the queued writer.
+ if (wake_type == RWSEM_WAKE_ANY) {
+ /*
+ * Mark writer at the front of the queue for wakeup.
+ * Until the task is actually awoken later by
+ * the caller, other writers are able to steal it.
+ * Readers, on the other hand, will block as they
+ * will notice the queued writer.
*/
- wake_up_process(waiter->task);
+ wake_q_add(wake_q, waiter->task);
+ }
goto out;
}
@@ -146,15 +153,27 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
if (wake_type != RWSEM_WAKE_READ_OWNED) {
adjustment = RWSEM_ACTIVE_READ_BIAS;
try_reader_grant:
- oldcount = rwsem_atomic_update(adjustment, sem) - adjustment;
+ oldcount = atomic_long_fetch_add(adjustment, &sem->count);
+
if (unlikely(oldcount < RWSEM_WAITING_BIAS)) {
- /* A writer stole the lock. Undo our reader grant. */
- if (rwsem_atomic_update(-adjustment, sem) &
- RWSEM_ACTIVE_MASK)
+ /*
+ * If the count is still less than RWSEM_WAITING_BIAS
+ * after removing the adjustment, it is assumed that
+ * a writer has stolen the lock. We have to undo our
+ * reader grant.
+ */
+ if (atomic_long_add_return(-adjustment, &sem->count) <
+ RWSEM_WAITING_BIAS)
goto out;
/* Last active locker left. Retry waking readers. */
goto try_reader_grant;
}
+ /*
+ * It is not really necessary to set it to reader-owned here,
+ * but it gives the spinners an early indication that the
+ * readers now have the lock.
+ */
+ rwsem_set_reader_owned(sem);
}
/* Grant an infinite number of read locks to the readers at the front
@@ -179,7 +198,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
adjustment -= RWSEM_WAITING_BIAS;
if (adjustment)
- rwsem_atomic_add(adjustment, sem);
+ atomic_long_add(adjustment, &sem->count);
next = sem->wait_list.next;
loop = woken;
@@ -187,17 +206,15 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
waiter = list_entry(next, struct rwsem_waiter, list);
next = waiter->list.next;
tsk = waiter->task;
+
+ wake_q_add(wake_q, tsk);
/*
- * Make sure we do not wakeup the next reader before
- * setting the nil condition to grant the next reader;
- * otherwise we could miss the wakeup on the other
- * side and end up sleeping again. See the pairing
- * in rwsem_down_read_failed().
+ * Ensure that the last operation is setting the reader
+ * waiter to nil such that rwsem_down_read_failed() cannot
+ * race with do_exit() by always holding a reference count
+ * to the task to wakeup.
*/
- smp_mb();
- waiter->task = NULL;
- wake_up_process(tsk);
- put_task_struct(tsk);
+ smp_store_release(&waiter->task, NULL);
} while (--loop);
sem->wait_list.next = next;
@@ -216,11 +233,11 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
long count, adjustment = -RWSEM_ACTIVE_READ_BIAS;
struct rwsem_waiter waiter;
struct task_struct *tsk = current;
+ WAKE_Q(wake_q);
/* set up my own style of waitqueue */
waiter.task = tsk;
waiter.type = RWSEM_WAITING_FOR_READ;
- get_task_struct(tsk);
raw_spin_lock_irq(&sem->wait_lock);
if (list_empty(&sem->wait_list))
@@ -228,7 +245,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
list_add_tail(&waiter.list, &sem->wait_list);
/* we're now waiting on the lock, but no longer actively locking */
- count = rwsem_atomic_update(adjustment, sem);
+ count = atomic_long_add_return(adjustment, &sem->count);
/* If there are no active locks, wake the front queued process(es).
*
@@ -238,9 +255,10 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
if (count == RWSEM_WAITING_BIAS ||
(count > RWSEM_WAITING_BIAS &&
adjustment != -RWSEM_ACTIVE_READ_BIAS))
- sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY);
+ sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
raw_spin_unlock_irq(&sem->wait_lock);
+ wake_up_q(&wake_q);
/* wait to be given the lock */
while (true) {
@@ -255,17 +273,29 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
}
EXPORT_SYMBOL(rwsem_down_read_failed);
+/*
+ * This function must be called with the sem->wait_lock held to prevent
+ * race conditions between checking the rwsem wait list and setting the
+ * sem->count accordingly.
+ */
static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
{
/*
- * Try acquiring the write lock. Check count first in order
- * to reduce unnecessary expensive cmpxchg() operations.
+ * Avoid trying to acquire write lock if count isn't RWSEM_WAITING_BIAS.
*/
- if (count == RWSEM_WAITING_BIAS &&
- cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS,
- RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) {
- if (!list_is_singular(&sem->wait_list))
- rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
+ if (count != RWSEM_WAITING_BIAS)
+ return false;
+
+ /*
+ * Acquire the lock by trying to set it to ACTIVE_WRITE_BIAS. If there
+ * are other tasks on the wait list, we need to add on WAITING_BIAS.
+ */
+ count = list_is_singular(&sem->wait_list) ?
+ RWSEM_ACTIVE_WRITE_BIAS :
+ RWSEM_ACTIVE_WRITE_BIAS + RWSEM_WAITING_BIAS;
+
+ if (atomic_long_cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS, count)
+ == RWSEM_WAITING_BIAS) {
rwsem_set_owner(sem);
return true;
}
@@ -279,13 +309,13 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
*/
static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
{
- long old, count = READ_ONCE(sem->count);
+ long old, count = atomic_long_read(&sem->count);
while (true) {
if (!(count == 0 || count == RWSEM_WAITING_BIAS))
return false;
- old = cmpxchg_acquire(&sem->count, count,
+ old = atomic_long_cmpxchg_acquire(&sem->count, count,
count + RWSEM_ACTIVE_WRITE_BIAS);
if (old == count) {
rwsem_set_owner(sem);
@@ -306,16 +336,11 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
rcu_read_lock();
owner = READ_ONCE(sem->owner);
- if (!owner) {
- long count = READ_ONCE(sem->count);
+ if (!rwsem_owner_is_writer(owner)) {
/*
- * If sem->owner is not set, yet we have just recently entered the
- * slowpath with the lock being active, then there is a possibility
- * reader(s) may have the lock. To be safe, bail spinning in these
- * situations.
+ * Don't spin if the rwsem is readers owned.
*/
- if (count & RWSEM_ACTIVE_MASK)
- ret = false;
+ ret = !rwsem_owner_is_reader(owner);
goto done;
}
@@ -325,10 +350,15 @@ done:
return ret;
}
-static noinline
-bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner)
+/*
+ * Return true only if we can still spin on the owner field of the rwsem.
+ */
+static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem)
{
- long count;
+ struct task_struct *owner = READ_ONCE(sem->owner);
+
+ if (!rwsem_owner_is_writer(owner))
+ goto out;
rcu_read_lock();
while (sem->owner == owner) {
@@ -349,22 +379,16 @@ bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner)
cpu_relax_lowlatency();
}
rcu_read_unlock();
-
- if (READ_ONCE(sem->owner))
- return true; /* new owner, continue spinning */
-
+out:
/*
- * When the owner is not set, the lock could be free or
- * held by readers. Check the counter to verify the
- * state.
+ * If there is a new owner or the owner is not set, we continue
+ * spinning.
*/
- count = READ_ONCE(sem->count);
- return (count == 0 || count == RWSEM_WAITING_BIAS);
+ return !rwsem_owner_is_reader(READ_ONCE(sem->owner));
}
static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
{
- struct task_struct *owner;
bool taken = false;
preempt_disable();
@@ -376,12 +400,17 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
if (!osq_lock(&sem->osq))
goto done;
- while (true) {
- owner = READ_ONCE(sem->owner);
- if (owner && !rwsem_spin_on_owner(sem, owner))
- break;
-
- /* wait_lock will be acquired if write_lock is obtained */
+ /*
+ * Optimistically spin on the owner field and attempt to acquire the
+ * lock whenever the owner changes. Spinning will be stopped when:
+ * 1) the owning writer isn't running; or
+ * 2) readers own the lock as we can't determine if they are
+ * actively running or not.
+ */
+ while (rwsem_spin_on_owner(sem)) {
+ /*
+ * Try to acquire the lock
+ */
if (rwsem_try_write_lock_unqueued(sem)) {
taken = true;
break;
@@ -393,7 +422,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
* we're an RT task that will live-lock because we won't let
* the owner complete.
*/
- if (!owner && (need_resched() || rt_task(current)))
+ if (!sem->owner && (need_resched() || rt_task(current)))
break;
/*
@@ -440,9 +469,10 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
bool waiting = true; /* any queued threads before us */
struct rwsem_waiter waiter;
struct rw_semaphore *ret = sem;
+ WAKE_Q(wake_q);
/* undo write bias from down_write operation, stop active locking */
- count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem);
+ count = atomic_long_sub_return(RWSEM_ACTIVE_WRITE_BIAS, &sem->count);
/* do optimistic spinning and steal lock if possible */
if (rwsem_optimistic_spin(sem))
@@ -465,18 +495,29 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
/* we're now waiting on the lock, but no longer actively locking */
if (waiting) {
- count = READ_ONCE(sem->count);
+ count = atomic_long_read(&sem->count);
/*
* If there were already threads queued before us and there are
* no active writers, the lock must be read owned; so we try to
* wake any read locks that were queued ahead of us.
*/
- if (count > RWSEM_WAITING_BIAS)
- sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS);
+ if (count > RWSEM_WAITING_BIAS) {
+ WAKE_Q(wake_q);
+
+ sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q);
+ /*
+ * The wakeup is normally called _after_ the wait_lock
+ * is released, but given that we are proactively waking
+ * readers we can deal with the wake_q overhead as it is
+ * similar to releasing and taking the wait_lock again
+ * for attempting rwsem_try_write_lock().
+ */
+ wake_up_q(&wake_q);
+ }
} else
- count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
+ count = atomic_long_add_return(RWSEM_WAITING_BIAS, &sem->count);
/* wait until we successfully acquire the lock */
set_current_state(state);
@@ -492,7 +533,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
schedule();
set_current_state(state);
- } while ((count = sem->count) & RWSEM_ACTIVE_MASK);
+ } while ((count = atomic_long_read(&sem->count)) & RWSEM_ACTIVE_MASK);
raw_spin_lock_irq(&sem->wait_lock);
}
@@ -507,10 +548,11 @@ out_nolock:
raw_spin_lock_irq(&sem->wait_lock);
list_del(&waiter.list);
if (list_empty(&sem->wait_list))
- rwsem_atomic_update(-RWSEM_WAITING_BIAS, sem);
+ atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count);
else
- __rwsem_do_wake(sem, RWSEM_WAKE_ANY);
+ __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
raw_spin_unlock_irq(&sem->wait_lock);
+ wake_up_q(&wake_q);
return ERR_PTR(-EINTR);
}
@@ -537,6 +579,7 @@ __visible
struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
{
unsigned long flags;
+ WAKE_Q(wake_q);
/*
* If a spinner is present, it is not necessary to do the wakeup.
@@ -573,9 +616,10 @@ locked:
/* do nothing if list empty */
if (!list_empty(&sem->wait_list))
- sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY);
+ sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+ wake_up_q(&wake_q);
return sem;
}
@@ -590,14 +634,16 @@ __visible
struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
{
unsigned long flags;
+ WAKE_Q(wake_q);
raw_spin_lock_irqsave(&sem->wait_lock, flags);
/* do nothing if list empty */
if (!list_empty(&sem->wait_list))
- sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED);
+ sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q);
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+ wake_up_q(&wake_q);
return sem;
}
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 2e853ad93..45ba475d4 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -22,6 +22,7 @@ void __sched down_read(struct rw_semaphore *sem)
rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
+ rwsem_set_reader_owned(sem);
}
EXPORT_SYMBOL(down_read);
@@ -33,8 +34,10 @@ int down_read_trylock(struct rw_semaphore *sem)
{
int ret = __down_read_trylock(sem);
- if (ret == 1)
+ if (ret == 1) {
rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_);
+ rwsem_set_reader_owned(sem);
+ }
return ret;
}
@@ -124,7 +127,7 @@ void downgrade_write(struct rw_semaphore *sem)
* lockdep: a downgraded write will live on as a write
* dependency.
*/
- rwsem_clear_owner(sem);
+ rwsem_set_reader_owned(sem);
__downgrade_write(sem);
}
@@ -138,6 +141,7 @@ void down_read_nested(struct rw_semaphore *sem, int subclass)
rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
+ rwsem_set_reader_owned(sem);
}
EXPORT_SYMBOL(down_read_nested);
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
index 870ed9a5b..a699f4048 100644
--- a/kernel/locking/rwsem.h
+++ b/kernel/locking/rwsem.h
@@ -1,14 +1,58 @@
+/*
+ * The owner field of the rw_semaphore structure will be set to
+ * RWSEM_READER_OWNED when a reader grabs the lock. A writer will clear
+ * the owner field when it unlocks. A reader, on the other hand, will
+ * not touch the owner field when it unlocks.
+ *
+ * In essence, the owner field now has the following 3 states:
+ * 1) 0
+ * - lock is free or the owner hasn't set the field yet
+ * 2) RWSEM_READER_OWNED
+ * - lock is currently or previously owned by readers (lock is free
+ * or not set by owner yet)
+ * 3) Other non-zero value
+ * - a writer owns the lock
+ */
+#define RWSEM_READER_OWNED ((struct task_struct *)1UL)
+
#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
+/*
+ * All writes to owner are protected by WRITE_ONCE() to make sure that
+ * store tearing can't happen as optimistic spinners may read and use
+ * the owner value concurrently without lock. Read from owner, however,
+ * may not need READ_ONCE() as long as the pointer value is only used
+ * for comparison and isn't being dereferenced.
+ */
static inline void rwsem_set_owner(struct rw_semaphore *sem)
{
- sem->owner = current;
+ WRITE_ONCE(sem->owner, current);
}
static inline void rwsem_clear_owner(struct rw_semaphore *sem)
{
- sem->owner = NULL;
+ WRITE_ONCE(sem->owner, NULL);
+}
+
+static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
+{
+ /*
+ * We check the owner value first to make sure that we will only
+ * do a write to the rwsem cacheline when it is really necessary
+ * to minimize cacheline contention.
+ */
+ if (sem->owner != RWSEM_READER_OWNED)
+ WRITE_ONCE(sem->owner, RWSEM_READER_OWNED);
+}
+
+static inline bool rwsem_owner_is_writer(struct task_struct *owner)
+{
+ return owner && owner != RWSEM_READER_OWNED;
}
+static inline bool rwsem_owner_is_reader(struct task_struct *owner)
+{
+ return owner == RWSEM_READER_OWNED;
+}
#else
static inline void rwsem_set_owner(struct rw_semaphore *sem)
{
@@ -17,4 +61,8 @@ static inline void rwsem_set_owner(struct rw_semaphore *sem)
static inline void rwsem_clear_owner(struct rw_semaphore *sem)
{
}
+
+static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
+{
+}
#endif
diff --git a/kernel/memremap.c b/kernel/memremap.c
index c2eb3a057..b501e390b 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -169,12 +169,6 @@ void devm_memunmap(struct device *dev, void *addr)
}
EXPORT_SYMBOL(devm_memunmap);
-pfn_t phys_to_pfn_t(phys_addr_t addr, u64 flags)
-{
- return __pfn_to_pfn_t(addr >> PAGE_SHIFT, flags);
-}
-EXPORT_SYMBOL(phys_to_pfn_t);
-
#ifdef CONFIG_ZONE_DEVICE
static DEFINE_MUTEX(pgmap_lock);
static RADIX_TREE(pgmap_radix, GFP_KERNEL);
@@ -310,12 +304,6 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
if (is_ram == REGION_INTERSECTS)
return __va(res->start);
- if (altmap && !IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) {
- dev_err(dev, "%s: altmap requires CONFIG_SPARSEMEM_VMEMMAP=y\n",
- __func__);
- return ERR_PTR(-ENXIO);
- }
-
if (!ref)
return ERR_PTR(-EINVAL);
@@ -410,7 +398,6 @@ void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns)
altmap->alloc -= nr_pfns;
}
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
{
/*
@@ -436,5 +423,4 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
return pgmap ? pgmap->altmap : NULL;
}
-#endif /* CONFIG_SPARSEMEM_VMEMMAP */
#endif /* CONFIG_ZONE_DEVICE */
diff --git a/kernel/module.c b/kernel/module.c
index 6458a2f17..529efae9f 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -60,6 +60,7 @@
#include <linux/jump_label.h>
#include <linux/pfn.h>
#include <linux/bsearch.h>
+#include <linux/dynamic_debug.h>
#include <uapi/linux/module.h>
#include "module-internal.h"
@@ -264,7 +265,7 @@ static void module_assert_mutex_or_preempt(void)
if (unlikely(!debug_locks))
return;
- WARN_ON(!rcu_read_lock_sched_held() &&
+ WARN_ON_ONCE(!rcu_read_lock_sched_held() &&
!lockdep_is_held(&module_mutex));
#endif
}
@@ -336,7 +337,7 @@ static inline void add_taint_module(struct module *mod, unsigned flag,
* A thread that wants to hold a reference to a module only while it
* is running can call this to safely exit. nfsd and lockd use this.
*/
-void __module_put_and_exit(struct module *mod, long code)
+void __noreturn __module_put_and_exit(struct module *mod, long code)
{
module_put(mod);
do_exit(code);
@@ -1693,8 +1694,7 @@ static int module_add_modinfo_attrs(struct module *mod)
temp_attr = mod->modinfo_attrs;
for (i = 0; (attr = modinfo_attrs[i]) && !error; i++) {
- if (!attr->test ||
- (attr->test && attr->test(mod))) {
+ if (!attr->test || attr->test(mod)) {
memcpy(temp_attr, attr, sizeof(*temp_attr));
sysfs_attr_init(&temp_attr->attr);
error = sysfs_create_file(&mod->mkobj.kobj,
@@ -1858,10 +1858,11 @@ static void mod_sysfs_teardown(struct module *mod)
* from modification and any data from execution.
*
* General layout of module is:
- * [text] [read-only-data] [writable data]
- * text_size -----^ ^ ^
- * ro_size ------------------------| |
- * size -------------------------------------------|
+ * [text] [read-only-data] [ro-after-init] [writable data]
+ * text_size -----^ ^ ^ ^
+ * ro_size ------------------------| | |
+ * ro_after_init_size -----------------------------| |
+ * size -----------------------------------------------------------|
*
* These values are always page-aligned (as is base)
*/
@@ -1884,14 +1885,24 @@ static void frob_rodata(const struct module_layout *layout,
(layout->ro_size - layout->text_size) >> PAGE_SHIFT);
}
+static void frob_ro_after_init(const struct module_layout *layout,
+ int (*set_memory)(unsigned long start, int num_pages))
+{
+ BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));
+ BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1));
+ BUG_ON((unsigned long)layout->ro_after_init_size & (PAGE_SIZE-1));
+ set_memory((unsigned long)layout->base + layout->ro_size,
+ (layout->ro_after_init_size - layout->ro_size) >> PAGE_SHIFT);
+}
+
static void frob_writable_data(const struct module_layout *layout,
int (*set_memory)(unsigned long start, int num_pages))
{
BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));
- BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1));
+ BUG_ON((unsigned long)layout->ro_after_init_size & (PAGE_SIZE-1));
BUG_ON((unsigned long)layout->size & (PAGE_SIZE-1));
- set_memory((unsigned long)layout->base + layout->ro_size,
- (layout->size - layout->ro_size) >> PAGE_SHIFT);
+ set_memory((unsigned long)layout->base + layout->ro_after_init_size,
+ (layout->size - layout->ro_after_init_size) >> PAGE_SHIFT);
}
/* livepatching wants to disable read-only so it can frob module. */
@@ -1899,21 +1910,26 @@ void module_disable_ro(const struct module *mod)
{
frob_text(&mod->core_layout, set_memory_rw);
frob_rodata(&mod->core_layout, set_memory_rw);
+ frob_ro_after_init(&mod->core_layout, set_memory_rw);
frob_text(&mod->init_layout, set_memory_rw);
frob_rodata(&mod->init_layout, set_memory_rw);
}
-void module_enable_ro(const struct module *mod)
+void module_enable_ro(const struct module *mod, bool after_init)
{
frob_text(&mod->core_layout, set_memory_ro);
frob_rodata(&mod->core_layout, set_memory_ro);
frob_text(&mod->init_layout, set_memory_ro);
frob_rodata(&mod->init_layout, set_memory_ro);
+
+ if (after_init)
+ frob_ro_after_init(&mod->core_layout, set_memory_ro);
}
static void module_enable_nx(const struct module *mod)
{
frob_rodata(&mod->core_layout, set_memory_nx);
+ frob_ro_after_init(&mod->core_layout, set_memory_nx);
frob_writable_data(&mod->core_layout, set_memory_nx);
frob_rodata(&mod->init_layout, set_memory_nx);
frob_writable_data(&mod->init_layout, set_memory_nx);
@@ -1922,6 +1938,7 @@ static void module_enable_nx(const struct module *mod)
static void module_disable_nx(const struct module *mod)
{
frob_rodata(&mod->core_layout, set_memory_x);
+ frob_ro_after_init(&mod->core_layout, set_memory_x);
frob_writable_data(&mod->core_layout, set_memory_x);
frob_rodata(&mod->init_layout, set_memory_x);
frob_writable_data(&mod->init_layout, set_memory_x);
@@ -1964,6 +1981,8 @@ static void disable_ro_nx(const struct module_layout *layout)
frob_text(layout, set_memory_rw);
frob_rodata(layout, set_memory_rw);
frob_rodata(layout, set_memory_x);
+ frob_ro_after_init(layout, set_memory_rw);
+ frob_ro_after_init(layout, set_memory_x);
frob_writable_data(layout, set_memory_x);
}
@@ -2306,6 +2325,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
* finder in the two loops below */
{ SHF_EXECINSTR | SHF_ALLOC, ARCH_SHF_SMALL },
{ SHF_ALLOC, SHF_WRITE | ARCH_SHF_SMALL },
+ { SHF_RO_AFTER_INIT | SHF_ALLOC, ARCH_SHF_SMALL },
{ SHF_WRITE | SHF_ALLOC, ARCH_SHF_SMALL },
{ ARCH_SHF_SMALL | SHF_ALLOC, 0 }
};
@@ -2337,7 +2357,11 @@ static void layout_sections(struct module *mod, struct load_info *info)
mod->core_layout.size = debug_align(mod->core_layout.size);
mod->core_layout.ro_size = mod->core_layout.size;
break;
- case 3: /* whole core */
+ case 2: /* RO after init */
+ mod->core_layout.size = debug_align(mod->core_layout.size);
+ mod->core_layout.ro_after_init_size = mod->core_layout.size;
+ break;
+ case 4: /* whole core */
mod->core_layout.size = debug_align(mod->core_layout.size);
break;
}
@@ -2367,7 +2391,14 @@ static void layout_sections(struct module *mod, struct load_info *info)
mod->init_layout.size = debug_align(mod->init_layout.size);
mod->init_layout.ro_size = mod->init_layout.size;
break;
- case 3: /* whole init */
+ case 2:
+ /*
+ * RO after init doesn't apply to init_layout (only
+ * core_layout), so it just takes the value of ro_size.
+ */
+ mod->init_layout.ro_after_init_size = mod->init_layout.ro_size;
+ break;
+ case 4: /* whole init */
mod->init_layout.size = debug_align(mod->init_layout.size);
break;
}
@@ -2925,8 +2956,12 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
return -ENOEXEC;
}
- if (!get_modinfo(info, "intree"))
+ if (!get_modinfo(info, "intree")) {
+ if (!test_taint(TAINT_OOT_MODULE))
+ pr_warn("%s: loading out-of-tree module taints kernel.\n",
+ mod->name);
add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK);
+ }
if (get_modinfo(info, "staging")) {
add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK);
@@ -3095,6 +3130,8 @@ static int move_module(struct module *mod, struct load_info *info)
static int check_module_license_and_versions(struct module *mod)
{
+ int prev_taint = test_taint(TAINT_PROPRIETARY_MODULE);
+
/*
* ndiswrapper is under GPL by itself, but loads proprietary modules.
* Don't use add_taint_module(), as it would prevent ndiswrapper from
@@ -3113,6 +3150,9 @@ static int check_module_license_and_versions(struct module *mod)
add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
LOCKDEP_NOW_UNRELIABLE);
+ if (!prev_taint && test_taint(TAINT_PROPRIETARY_MODULE))
+ pr_warn("%s: module license taints kernel.\n", mod->name);
+
#ifdef CONFIG_MODVERSIONS
if ((mod->num_syms && !mod->crcs)
|| (mod->num_gpl_syms && !mod->gpl_crcs)
@@ -3160,16 +3200,41 @@ int __weak module_frob_arch_sections(Elf_Ehdr *hdr,
return 0;
}
+/* module_blacklist is a comma-separated list of module names */
+static char *module_blacklist;
+static bool blacklisted(char *module_name)
+{
+ const char *p;
+ size_t len;
+
+ if (!module_blacklist)
+ return false;
+
+ for (p = module_blacklist; *p; p += len) {
+ len = strcspn(p, ",");
+ if (strlen(module_name) == len && !memcmp(module_name, p, len))
+ return true;
+ if (p[len] == ',')
+ len++;
+ }
+ return false;
+}
+core_param(module_blacklist, module_blacklist, charp, 0400);
+
static struct module *layout_and_allocate(struct load_info *info, int flags)
{
/* Module within temporary copy. */
struct module *mod;
+ unsigned int ndx;
int err;
mod = setup_load_info(info, flags);
if (IS_ERR(mod))
return mod;
+ if (blacklisted(mod->name))
+ return ERR_PTR(-EPERM);
+
err = check_modinfo(mod, info, flags);
if (err)
return ERR_PTR(err);
@@ -3183,6 +3248,15 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
/* We will do a special allocation for per-cpu sections later. */
info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC;
+ /*
+ * Mark ro_after_init section with SHF_RO_AFTER_INIT so that
+ * layout_sections() can put it in the right place.
+ * Note: ro_after_init sections also have SHF_{WRITE,ALLOC} set.
+ */
+ ndx = find_sec(info, ".data..ro_after_init");
+ if (ndx)
+ info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT;
+
/* Determine total sizes, and put offsets in sh_entsize. For now
this is done generically; there doesn't appear to be any
special cases for the architectures. */
@@ -3349,12 +3423,14 @@ static noinline int do_init_module(struct module *mod)
/* Switch to core kallsyms now init is done: kallsyms may be walking! */
rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms);
#endif
+ module_enable_ro(mod, true);
mod_tree_remove_init(mod);
disable_ro_nx(&mod->init_layout);
module_arch_freeing_init(mod);
mod->init_layout.base = NULL;
mod->init_layout.size = 0;
mod->init_layout.ro_size = 0;
+ mod->init_layout.ro_after_init_size = 0;
mod->init_layout.text_size = 0;
/*
* We want to free module_init, but be aware that kallsyms may be
@@ -3446,8 +3522,7 @@ static int complete_formation(struct module *mod, struct load_info *info)
/* This relies on module_mutex for list integrity. */
module_bug_finalize(info->hdr, info->sechdrs, mod);
- /* Set RO and NX regions */
- module_enable_ro(mod);
+ module_enable_ro(mod, false);
module_enable_nx(mod);
/* Mark state as coming so strong_try_module_get() ignores us,
diff --git a/kernel/panic.c b/kernel/panic.c
index 8aa74497c..ca8cea1ef 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -108,6 +108,7 @@ void panic(const char *fmt, ...)
long i, i_next = 0;
int state = 0;
int old_cpu, this_cpu;
+ bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers;
/*
* Disable local interrupts. This will prevent panic_smp_self_stop
@@ -160,7 +161,7 @@ void panic(const char *fmt, ...)
*
* Bypass the panic_cpu check and call __crash_kexec directly.
*/
- if (!crash_kexec_post_notifiers) {
+ if (!_crash_kexec_post_notifiers) {
printk_nmi_flush_on_panic();
__crash_kexec(NULL);
}
@@ -191,7 +192,7 @@ void panic(const char *fmt, ...)
*
* Bypass the panic_cpu check and call __crash_kexec directly.
*/
- if (crash_kexec_post_notifiers)
+ if (_crash_kexec_post_notifiers)
__crash_kexec(NULL);
bust_spinlocks(0);
@@ -571,13 +572,7 @@ EXPORT_SYMBOL(__stack_chk_fail);
core_param(panic, panic_timeout, int, 0644);
core_param(pause_on_oops, pause_on_oops, int, 0644);
core_param(panic_on_warn, panic_on_warn, int, 0644);
-
-static int __init setup_crash_kexec_post_notifiers(char *s)
-{
- crash_kexec_post_notifiers = true;
- return 0;
-}
-early_param("crash_kexec_post_notifiers", setup_crash_kexec_post_notifiers);
+core_param(crash_kexec_post_notifiers, crash_kexec_post_notifiers, bool, 0644);
static int __init oops_setup(char *s)
{
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 5f93a3ccd..68d3ebc12 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -101,284 +101,6 @@ config PM_STD_PARTITION
suspended image to. It will simply pick the first available swap
device.
-menuconfig TOI_CORE
- bool "Enhanced Hibernation (TuxOnIce)"
- depends on HIBERNATION
- default y
- ---help---
- TuxOnIce is the 'new and improved' suspend support.
-
- See the TuxOnIce home page (tuxonice.net)
- for FAQs, HOWTOs and other documentation.
-
- comment "Image Storage (you need at least one allocator)"
- depends on TOI_CORE
-
- config TOI_FILE
- bool "File Allocator"
- depends on TOI_CORE
- default y
- ---help---
- This option enables support for storing an image in a
- simple file. You might want this if your swap is
- sometimes full enough that you don't have enough spare
- space to store an image.
-
- config TOI_SWAP
- bool "Swap Allocator"
- depends on TOI_CORE && SWAP
- default y
- ---help---
- This option enables support for storing an image in your
- swap space.
-
- comment "General Options"
- depends on TOI_CORE
-
- config TOI_PRUNE
- bool "Image pruning support"
- depends on TOI_CORE && CRYPTO && BROKEN
- default y
- ---help---
- This option adds support for using cryptoapi hashing
- algorithms to identify pages with the same content. We
- then write a much smaller pointer to the first copy of
- the data instead of a complete (perhaps compressed)
- additional copy.
-
- You probably want this, so say Y here.
-
- comment "No image pruning support available without Cryptoapi support."
- depends on TOI_CORE && !CRYPTO
-
- config TOI_CRYPTO
- bool "Compression support"
- depends on TOI_CORE && CRYPTO
- default y
- ---help---
- This option adds support for using cryptoapi compression
- algorithms. Compression is particularly useful as it can
- more than double your suspend and resume speed (depending
- upon how well your image compresses).
-
- You probably want this, so say Y here.
-
- comment "No compression support available without Cryptoapi support."
- depends on TOI_CORE && !CRYPTO
-
- config TOI_USERUI
- bool "Userspace User Interface support"
- depends on TOI_CORE && NET && (VT || SERIAL_CONSOLE)
- default y
- ---help---
- This option enabled support for a userspace based user interface
- to TuxOnIce, which allows you to have a nice display while suspending
- and resuming, and also enables features such as pressing escape to
- cancel a cycle or interactive debugging.
-
- config TOI_USERUI_DEFAULT_PATH
- string "Default userui program location"
- default "/usr/local/sbin/tuxoniceui_text"
- depends on TOI_USERUI
- ---help---
- This entry allows you to specify a default path to the userui binary.
-
- config TOI_DEFAULT_IMAGE_SIZE_LIMIT
- int "Default image size limit"
- range -2 65536
- default "-2"
- depends on TOI_CORE
- ---help---
- This entry allows you to specify a default image size limit. It can
- be overridden at run-time using /sys/power/tuxonice/image_size_limit.
-
- config TOI_KEEP_IMAGE
- bool "Allow Keep Image Mode"
- depends on TOI_CORE
- ---help---
- This option allows you to keep and image and reuse it. It is intended
- __ONLY__ for use with systems where all filesystems are mounted read-
- only (kiosks, for example). To use it, compile this option in and boot
- normally. Set the KEEP_IMAGE flag in /sys/power/tuxonice and suspend.
- When you resume, the image will not be removed. You will be unable to turn
- off swap partitions (assuming you are using the swap allocator), but future
- suspends simply do a power-down. The image can be updated using the
- kernel command line parameter suspend_act= to turn off the keep image
- bit. Keep image mode is a little less user friendly on purpose - it
- should not be used without thought!
-
- config TOI_INCREMENTAL
- bool "Incremental Image Support"
- depends on TOI_CORE && 64BIT && TOI_KEEP_IMAGE
- default n
- ---help---
- This option enables the work in progress toward using the dirty page
- tracking to record changes to pages. It is hoped that
- this will be an initial step toward implementing storing just
- the differences between consecutive images, which will
- increase the amount of storage needed for the image, but also
- increase the speed at which writing an image occurs and
- reduce the wear and tear on drives.
-
- At the moment, all that is implemented is the first step of keeping
- an existing image and then comparing it to the contents in memory
- (by setting /sys/power/tuxonice/verify_image to 1 and triggering a
- (fake) resume) to see what the page change tracking should find to be
- different. If you have verify_image set to 1, TuxOnIce will automatically
- invalidate the old image when you next try to hibernate, so there's no
- greater chance of disk corruption than normal.
-
- comment "No incremental image support available without Keep Image support."
- depends on TOI_CORE && !TOI_KEEP_IMAGE && 64BIT
-
- config TOI_REPLACE_SWSUSP
- bool "Replace swsusp by default"
- default y
- depends on TOI_CORE
- ---help---
- TuxOnIce can replace swsusp. This option makes that the default state,
- requiring you to echo 0 > /sys/power/tuxonice/replace_swsusp if you want
- to use the vanilla kernel functionality. Note that your initrd/ramfs will
- need to do this before trying to resume, too.
- With overriding swsusp enabled, echoing disk to /sys/power/state will
- start a TuxOnIce cycle. If resume= doesn't specify an allocator and both
- the swap and file allocators are compiled in, the swap allocator will be
- used by default.
-
- config TOI_IGNORE_LATE_INITCALL
- bool "Wait for initrd/ramfs to run, by default"
- default n
- depends on TOI_CORE
- ---help---
- When booting, TuxOnIce can check for an image and start to resume prior
- to any initrd/ramfs running (via a late initcall).
-
- If you don't have an initrd/ramfs, this is what you want to happen -
- otherwise you won't be able to safely resume. You should set this option
- to 'No'.
-
- If, however, you want your initrd/ramfs to run anyway before resuming,
- you need to tell TuxOnIce to ignore that earlier opportunity to resume.
- This can be done either by using this compile time option, or by
- overriding this option with the boot-time parameter toi_initramfs_resume_only=1.
-
- Note that if TuxOnIce can't resume at the earlier opportunity, the
- value of this option won't matter - the initramfs/initrd (if any) will
- run anyway.
-
- menuconfig TOI_CLUSTER
- bool "Cluster support"
- default n
- depends on TOI_CORE && NET && BROKEN
- ---help---
- Support for linking multiple machines in a cluster so that they suspend
- and resume together.
-
- config TOI_DEFAULT_CLUSTER_INTERFACE
- string "Default cluster interface"
- depends on TOI_CLUSTER
- ---help---
- The default interface on which to communicate with other nodes in
- the cluster.
-
- If no value is set here, cluster support will be disabled by default.
-
- config TOI_DEFAULT_CLUSTER_KEY
- string "Default cluster key"
- default "Default"
- depends on TOI_CLUSTER
- ---help---
- The default key used by this node. All nodes in the same cluster
- have the same key. Multiple clusters may coexist on the same lan
- by using different values for this key.
-
- config TOI_CLUSTER_IMAGE_TIMEOUT
- int "Timeout when checking for image"
- default 15
- depends on TOI_CLUSTER
- ---help---
- Timeout (seconds) before continuing to boot when waiting to see
- whether other nodes might have an image. Set to -1 to wait
- indefinitely. In WAIT_UNTIL_NODES is non zero, we might continue
- booting sooner than this timeout.
-
- config TOI_CLUSTER_WAIT_UNTIL_NODES
- int "Nodes without image before continuing"
- default 0
- depends on TOI_CLUSTER
- ---help---
- When booting and no image is found, we wait to see if other nodes
- have an image before continuing to boot. This value lets us
- continue after seeing a certain number of nodes without an image,
- instead of continuing to wait for the timeout. Set to 0 to only
- use the timeout.
-
- config TOI_DEFAULT_CLUSTER_PRE_HIBERNATE
- string "Default pre-hibernate script"
- depends on TOI_CLUSTER
- ---help---
- The default script to be called when starting to hibernate.
-
- config TOI_DEFAULT_CLUSTER_POST_HIBERNATE
- string "Default post-hibernate script"
- depends on TOI_CLUSTER
- ---help---
- The default script to be called after resuming from hibernation.
-
- config TOI_DEFAULT_WAIT
- int "Default waiting time for emergency boot messages"
- default "25"
- range -1 32768
- depends on TOI_CORE
- help
- TuxOnIce can display warnings very early in the process of resuming,
- if (for example) it appears that you have booted a kernel that doesn't
- match an image on disk. It can then give you the opportunity to either
- continue booting that kernel, or reboot the machine. This option can be
- used to control how long to wait in such circumstances. -1 means wait
- forever. 0 means don't wait at all (do the default action, which will
- generally be to continue booting and remove the image). Values of 1 or
- more indicate a number of seconds (up to 255) to wait before doing the
- default.
-
- config TOI_DEFAULT_EXTRA_PAGES_ALLOWANCE
- int "Default extra pages allowance"
- default "2000"
- range 500 32768
- depends on TOI_CORE
- help
- This value controls the default for the allowance TuxOnIce makes for
- drivers to allocate extra memory during the atomic copy. The default
- value of 2000 will be okay in most cases. If you are using
- DRI, the easiest way to find what value to use is to try to hibernate
- and look at how many pages were actually needed in the sysfs entry
- /sys/power/tuxonice/debug_info (first number on the last line), adding
- a little extra because the value is not always the same.
-
- config TOI_CHECKSUM
- bool "Checksum pageset2"
- default n
- depends on TOI_CORE
- select CRYPTO
- select CRYPTO_ALGAPI
- select CRYPTO_MD4
- ---help---
- Adds support for checksumming pageset2 pages, to ensure you really get an
- atomic copy. Since some filesystems (XFS especially) change metadata even
- when there's no other activity, we need this to check for pages that have
- been changed while we were saving the page cache. If your debugging output
- always says no pages were resaved, you may be able to safely disable this
- option.
-
-config TOI
- bool
- depends on TOI_CORE!=n
- default y
-
-config TOI_ZRAM_SUPPORT
- def_bool y
- depends on TOI && ZRAM!=n
-
config PM_SLEEP
def_bool y
depends on SUSPEND || HIBERNATE_CALLBACKS
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 82c4795e8..eb4f71770 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,37 +1,7 @@
ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG
-tuxonice_core-y := tuxonice_modules.o
-
-obj-$(CONFIG_TOI) += tuxonice_builtin.o
-obj-$(CONFIG_TOI_INCREMENTAL) += tuxonice_incremental.o \
- tuxonice_copy_before_write.o
-
-tuxonice_core-$(CONFIG_PM_DEBUG) += tuxonice_alloc.o
-
-# Compile these in after allocation debugging, if used.
-
-tuxonice_core-y += tuxonice_sysfs.o tuxonice_highlevel.o \
- tuxonice_io.o tuxonice_pagedir.o tuxonice_prepare_image.o \
- tuxonice_extent.o tuxonice_pageflags.o tuxonice_ui.o \
- tuxonice_power_off.o tuxonice_atomic_copy.o
-
-tuxonice_core-$(CONFIG_TOI_CHECKSUM) += tuxonice_checksum.o
-
-tuxonice_core-$(CONFIG_NET) += tuxonice_storage.o tuxonice_netlink.o
-
-obj-$(CONFIG_TOI_CORE) += tuxonice_core.o
-obj-$(CONFIG_TOI_PRUNE) += tuxonice_prune.o
-obj-$(CONFIG_TOI_CRYPTO) += tuxonice_compress.o
-
-tuxonice_bio-y := tuxonice_bio_core.o tuxonice_bio_chains.o \
- tuxonice_bio_signature.o
-
-obj-$(CONFIG_TOI_SWAP) += tuxonice_bio.o tuxonice_swap.o
-obj-$(CONFIG_TOI_FILE) += tuxonice_bio.o tuxonice_file.o
-obj-$(CONFIG_TOI_CLUSTER) += tuxonice_cluster.o
-
-obj-$(CONFIG_TOI_USERUI) += tuxonice_userui.o
+KASAN_SANITIZE_snapshot.o := n
obj-y += qos.o
obj-$(CONFIG_PM) += main.o
diff --git a/kernel/power/console.c b/kernel/power/console.c
index aba9c545a..0e781798b 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -126,17 +126,17 @@ out:
return ret;
}
-int pm_prepare_console(void)
+void pm_prepare_console(void)
{
if (!pm_vt_switch())
- return 0;
+ return;
orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1);
if (orig_fgconsole < 0)
- return 1;
+ return;
orig_kmsg = vt_kmsg_redirect(SUSPEND_CONSOLE);
- return 0;
+ return;
}
void pm_restore_console(void)
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 2444206e9..33c79b610 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -31,7 +31,7 @@
#include <linux/ktime.h>
#include <trace/events/power.h>
-#include "tuxonice.h"
+#include "power.h"
static int nocompress;
@@ -39,7 +39,7 @@ static int noresume;
static int nohibernate;
static int resume_wait;
static unsigned int resume_delay;
-char resume_file[256] = CONFIG_PM_STD_PARTITION;
+static char resume_file[256] = CONFIG_PM_STD_PARTITION;
dev_t swsusp_resume_device;
sector_t swsusp_resume_block;
__visible int in_suspend __nosavedata;
@@ -52,6 +52,7 @@ enum {
#ifdef CONFIG_SUSPEND
HIBERNATION_SUSPEND,
#endif
+ HIBERNATION_TEST_RESUME,
/* keep last */
__HIBERNATION_AFTER_LAST
};
@@ -123,7 +124,7 @@ static int hibernation_test(int level) { return 0; }
* platform_begin - Call platform to start hibernation.
* @platform_mode: Whether or not to use the platform driver.
*/
-int platform_begin(int platform_mode)
+static int platform_begin(int platform_mode)
{
return (platform_mode && hibernation_ops) ?
hibernation_ops->begin() : 0;
@@ -133,7 +134,7 @@ int platform_begin(int platform_mode)
* platform_end - Call platform to finish transition to the working state.
* @platform_mode: Whether or not to use the platform driver.
*/
-void platform_end(int platform_mode)
+static void platform_end(int platform_mode)
{
if (platform_mode && hibernation_ops)
hibernation_ops->end();
@@ -147,7 +148,7 @@ void platform_end(int platform_mode)
* if so configured, and return an error code if that fails.
*/
-int platform_pre_snapshot(int platform_mode)
+static int platform_pre_snapshot(int platform_mode)
{
return (platform_mode && hibernation_ops) ?
hibernation_ops->pre_snapshot() : 0;
@@ -162,7 +163,7 @@ int platform_pre_snapshot(int platform_mode)
*
* This routine is called on one CPU with interrupts disabled.
*/
-void platform_leave(int platform_mode)
+static void platform_leave(int platform_mode)
{
if (platform_mode && hibernation_ops)
hibernation_ops->leave();
@@ -177,7 +178,7 @@ void platform_leave(int platform_mode)
*
* This routine must be called after platform_prepare().
*/
-void platform_finish(int platform_mode)
+static void platform_finish(int platform_mode)
{
if (platform_mode && hibernation_ops)
hibernation_ops->finish();
@@ -193,7 +194,7 @@ void platform_finish(int platform_mode)
* If the restore fails after this function has been called,
* platform_restore_cleanup() must be called.
*/
-int platform_pre_restore(int platform_mode)
+static int platform_pre_restore(int platform_mode)
{
return (platform_mode && hibernation_ops) ?
hibernation_ops->pre_restore() : 0;
@@ -210,7 +211,7 @@ int platform_pre_restore(int platform_mode)
* function must be called too, regardless of the result of
* platform_pre_restore().
*/
-void platform_restore_cleanup(int platform_mode)
+static void platform_restore_cleanup(int platform_mode)
{
if (platform_mode && hibernation_ops)
hibernation_ops->restore_cleanup();
@@ -220,7 +221,7 @@ void platform_restore_cleanup(int platform_mode)
* platform_recover - Recover from a failure to suspend devices.
* @platform_mode: Whether or not to use the platform driver.
*/
-void platform_recover(int platform_mode)
+static void platform_recover(int platform_mode)
{
if (platform_mode && hibernation_ops && hibernation_ops->recover)
hibernation_ops->recover();
@@ -409,6 +410,11 @@ int hibernation_snapshot(int platform_mode)
goto Close;
}
+int __weak hibernate_resume_nonboot_cpu_disable(void)
+{
+ return disable_nonboot_cpus();
+}
+
/**
* resume_target_kernel - Restore system state from a hibernation image.
* @platform_mode: Whether or not to use the platform driver.
@@ -433,7 +439,7 @@ static int resume_target_kernel(bool platform_mode)
if (error)
goto Cleanup;
- error = disable_nonboot_cpus();
+ error = hibernate_resume_nonboot_cpu_disable();
if (error)
goto Enable_cpus;
@@ -642,15 +648,39 @@ static void power_down(void)
cpu_relax();
}
+static int load_image_and_restore(void)
+{
+ int error;
+ unsigned int flags;
+
+ pr_debug("PM: Loading hibernation image.\n");
+
+ lock_device_hotplug();
+ error = create_basic_memory_bitmaps();
+ if (error)
+ goto Unlock;
+
+ error = swsusp_read(&flags);
+ swsusp_close(FMODE_READ);
+ if (!error)
+ hibernation_restore(flags & SF_PLATFORM_MODE);
+
+ printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n");
+ swsusp_free();
+ free_basic_memory_bitmaps();
+ Unlock:
+ unlock_device_hotplug();
+
+ return error;
+}
+
/**
* hibernate - Carry out system hibernation, including saving the image.
*/
int hibernate(void)
{
- int error;
-
- if (test_action_state(TOI_REPLACE_SWSUSP))
- return try_tuxonice_hibernate();
+ int error, nr_calls = 0;
+ bool snapshot_test = false;
if (!hibernation_available()) {
pr_debug("PM: Hibernation not available.\n");
@@ -665,9 +695,11 @@ int hibernate(void)
}
pm_prepare_console();
- error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
- if (error)
+ error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls);
+ if (error) {
+ nr_calls--;
goto Exit;
+ }
printk(KERN_INFO "PM: Syncing filesystems ... ");
sys_sync();
@@ -700,8 +732,12 @@ int hibernate(void)
pr_debug("PM: writing image.\n");
error = swsusp_write(flags);
swsusp_free();
- if (!error)
- power_down();
+ if (!error) {
+ if (hibernation_mode == HIBERNATION_TEST_RESUME)
+ snapshot_test = true;
+ else
+ power_down();
+ }
in_suspend = 0;
pm_restore_gfp_mask();
} else {
@@ -712,12 +748,18 @@ int hibernate(void)
free_basic_memory_bitmaps();
Thaw:
unlock_device_hotplug();
+ if (snapshot_test) {
+ pr_debug("PM: Checking hibernation image\n");
+ error = swsusp_check();
+ if (!error)
+ error = load_image_and_restore();
+ }
thaw_processes();
/* Don't bother checking whether freezer_test_done is true */
freezer_test_done = false;
Exit:
- pm_notifier_call_chain(PM_POST_HIBERNATION);
+ __pm_notifier_call_chain(PM_POST_HIBERNATION, nr_calls, NULL);
pm_restore_console();
atomic_inc(&snapshot_device_available);
Unlock:
@@ -741,18 +783,9 @@ int hibernate(void)
* attempts to recover gracefully and make the kernel return to the normal mode
* of operation.
*/
-int software_resume(void)
+static int software_resume(void)
{
- int error;
- unsigned int flags;
-
- resume_attempted = 1;
-
- /*
- * We can't know (until an image header - if any - is loaded), whether
- * we did override swsusp. We therefore ensure that both are tried.
- */
- try_tuxonice_resume();
+ int error, nr_calls = 0;
/*
* If the user said "noresume".. bail out early.
@@ -838,35 +871,20 @@ int software_resume(void)
}
pm_prepare_console();
- error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
- if (error)
+ error = __pm_notifier_call_chain(PM_RESTORE_PREPARE, -1, &nr_calls);
+ if (error) {
+ nr_calls--;
goto Close_Finish;
+ }
pr_debug("PM: Preparing processes for restore.\n");
error = freeze_processes();
if (error)
goto Close_Finish;
-
- pr_debug("PM: Loading hibernation image.\n");
-
- lock_device_hotplug();
- error = create_basic_memory_bitmaps();
- if (error)
- goto Thaw;
-
- error = swsusp_read(&flags);
- swsusp_close(FMODE_READ);
- if (!error)
- hibernation_restore(flags & SF_PLATFORM_MODE);
-
- printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n");
- swsusp_free();
- free_basic_memory_bitmaps();
- Thaw:
- unlock_device_hotplug();
+ error = load_image_and_restore();
thaw_processes();
Finish:
- pm_notifier_call_chain(PM_POST_RESTORE);
+ __pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL);
pm_restore_console();
atomic_inc(&snapshot_device_available);
/* For success case, the suspend path will release the lock */
@@ -889,6 +907,7 @@ static const char * const hibernation_modes[] = {
#ifdef CONFIG_SUSPEND
[HIBERNATION_SUSPEND] = "suspend",
#endif
+ [HIBERNATION_TEST_RESUME] = "test_resume",
};
/*
@@ -935,6 +954,7 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
#ifdef CONFIG_SUSPEND
case HIBERNATION_SUSPEND:
#endif
+ case HIBERNATION_TEST_RESUME:
break;
case HIBERNATION_PLATFORM:
if (hibernation_ops)
@@ -981,6 +1001,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
#ifdef CONFIG_SUSPEND
case HIBERNATION_SUSPEND:
#endif
+ case HIBERNATION_TEST_RESUME:
hibernation_mode = mode;
break;
case HIBERNATION_PLATFORM:
@@ -1126,13 +1147,16 @@ static int __init resume_offset_setup(char *str)
static int __init hibernate_setup(char *str)
{
- if (!strncmp(str, "noresume", 8))
+ if (!strncmp(str, "noresume", 8)) {
noresume = 1;
- else if (!strncmp(str, "nocompress", 10))
+ } else if (!strncmp(str, "nocompress", 10)) {
nocompress = 1;
- else if (!strncmp(str, "no", 2)) {
+ } else if (!strncmp(str, "no", 2)) {
noresume = 1;
nohibernate = 1;
+ } else if (IS_ENABLED(CONFIG_DEBUG_RODATA)
+ && !strncmp(str, "protect_image", 13)) {
+ enable_restore_image_protection();
}
return 1;
}
@@ -1140,7 +1164,6 @@ static int __init hibernate_setup(char *str)
static int __init noresume_setup(char *str)
{
noresume = 1;
- set_toi_state(TOI_NORESUME_SPECIFIED);
return 1;
}
@@ -1166,11 +1189,6 @@ static int __init nohibernate_setup(char *str)
return 1;
}
-static int __init kaslr_nohibernate_setup(char *str)
-{
- return nohibernate_setup(str);
-}
-
static int __init page_poison_nohibernate_setup(char *str)
{
#ifdef CONFIG_PAGE_POISONING_ZERO
@@ -1194,5 +1212,4 @@ __setup("hibernate=", hibernate_setup);
__setup("resumewait", resumewait_setup);
__setup("resumedelay=", resumedelay_setup);
__setup("nohibernate", nohibernate_setup);
-__setup("kaslr", kaslr_nohibernate_setup);
__setup("page_poison=", page_poison_nohibernate_setup);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 27946975e..5ea50b1b7 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -38,12 +38,19 @@ int unregister_pm_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL_GPL(unregister_pm_notifier);
-int pm_notifier_call_chain(unsigned long val)
+int __pm_notifier_call_chain(unsigned long val, int nr_to_call, int *nr_calls)
{
- int ret = blocking_notifier_call_chain(&pm_chain_head, val, NULL);
+ int ret;
+
+ ret = __blocking_notifier_call_chain(&pm_chain_head, val, NULL,
+ nr_to_call, nr_calls);
return notifier_to_errno(ret);
}
+int pm_notifier_call_chain(unsigned long val)
+{
+ return __pm_notifier_call_chain(val, -1, NULL);
+}
/* If set, devices may be suspended and resumed asynchronously. */
int pm_async_enabled = 1;
diff --git a/kernel/power/power.h b/kernel/power/power.h
index bf4d922b2..242d8b827 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -36,12 +36,10 @@ static inline char *check_image_kernel(struct swsusp_info *info)
return arch_hibernation_header_restore(info) ?
"architecture specific data" : NULL;
}
-#else
-extern char *check_image_kernel(struct swsusp_info *info);
#endif /* CONFIG_ARCH_HIBERNATION_HEADER */
-extern int init_header(struct swsusp_info *info);
-extern char resume_file[256];
+extern int hibernate_resume_nonboot_cpu_disable(void);
+
/*
* Keep some memory free so that I/O operations can succeed without paging
* [Might this be more than 4 MB?]
@@ -63,6 +61,13 @@ extern int hibernation_snapshot(int platform_mode);
extern int hibernation_restore(int platform_mode);
extern int hibernation_platform_enter(void);
+#ifdef CONFIG_DEBUG_RODATA
+/* kernel/power/snapshot.c */
+extern void enable_restore_image_protection(void);
+#else
+static inline void enable_restore_image_protection(void) {}
+#endif /* CONFIG_DEBUG_RODATA */
+
#else /* !CONFIG_HIBERNATION */
static inline void hibernate_reserved_size_init(void) {}
@@ -81,7 +86,6 @@ static struct kobj_attribute _name##_attr = { \
.store = _name##_store, \
}
-extern struct pbe *restore_pblist;
#define power_attr_ro(_name) \
static struct kobj_attribute _name##_attr = { \
.attr = { \
@@ -205,6 +209,8 @@ static inline void suspend_test_finish(const char *label) {}
#ifdef CONFIG_PM_SLEEP
/* kernel/power/main.c */
+extern int __pm_notifier_call_chain(unsigned long val, int nr_to_call,
+ int *nr_calls);
extern int pm_notifier_call_chain(unsigned long val);
#endif
@@ -274,31 +280,6 @@ static inline void suspend_thaw_processes(void)
}
#endif
-extern struct page *saveable_page(struct zone *z, unsigned long p);
-#ifdef CONFIG_HIGHMEM
-struct page *saveable_highmem_page(struct zone *z, unsigned long p);
-#else
-static
-inline void *saveable_highmem_page(struct zone *z, unsigned long p)
-{
- return NULL;
-}
-#endif
-
-#define PBES_PER_PAGE (PAGE_SIZE / sizeof(struct pbe))
-extern struct list_head nosave_regions;
-
-/**
- * This structure represents a range of page frames the contents of which
- * should not be saved during the suspend.
- */
-
-struct nosave_region {
- struct list_head list;
- unsigned long start_pfn;
- unsigned long end_pfn;
-};
-
#ifdef CONFIG_PM_AUTOSLEEP
/* kernel/power/autosleep.c */
@@ -325,10 +306,3 @@ extern int pm_wake_lock(const char *buf);
extern int pm_wake_unlock(const char *buf);
#endif /* !CONFIG_PM_WAKELOCKS */
-
-#ifdef CONFIG_TOI
-unsigned long toi_get_nonconflicting_page(void);
-#define BM_END_OF_MAP (~0UL)
-#else
-#define toi_get_nonconflicting_page() (0)
-#endif
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 0c2ee9761..8f27d5a8a 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -89,6 +89,9 @@ static int try_to_freeze_tasks(bool user_only)
elapsed_msecs / 1000, elapsed_msecs % 1000,
todo - wq_busy, wq_busy);
+ if (wq_busy)
+ show_workqueue_state();
+
if (!wakeup) {
read_lock(&tasklist_lock);
for_each_process_thread(g, p) {
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 97b0df713..168ff442e 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -482,7 +482,16 @@ void pm_qos_update_request(struct pm_qos_request *req,
return;
}
- cancel_delayed_work_sync(&req->work);
+ /*
+ * This function may be called very early during boot, for example,
+ * from of_clk_init(), where irq needs to stay disabled.
+ * cancel_delayed_work_sync() assumes that irq is enabled on
+ * invocation and re-enables it on return. Avoid calling it until
+ * workqueue is initialized.
+ */
+ if (keventd_up())
+ cancel_delayed_work_sync(&req->work);
+
__pm_qos_update_request(req, new_value);
}
EXPORT_SYMBOL_GPL(pm_qos_update_request);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index dc3bab15e..b02228411 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -36,11 +36,45 @@
#include <asm/tlbflush.h>
#include <asm/io.h>
-#include "tuxonice_modules.h"
-#include "tuxonice_builtin.h"
-#include "tuxonice_alloc.h"
#include "power.h"
+#ifdef CONFIG_DEBUG_RODATA
+static bool hibernate_restore_protection;
+static bool hibernate_restore_protection_active;
+
+void enable_restore_image_protection(void)
+{
+ hibernate_restore_protection = true;
+}
+
+static inline void hibernate_restore_protection_begin(void)
+{
+ hibernate_restore_protection_active = hibernate_restore_protection;
+}
+
+static inline void hibernate_restore_protection_end(void)
+{
+ hibernate_restore_protection_active = false;
+}
+
+static inline void hibernate_restore_protect_page(void *page_address)
+{
+ if (hibernate_restore_protection_active)
+ set_memory_ro((unsigned long)page_address, 1);
+}
+
+static inline void hibernate_restore_unprotect_page(void *page_address)
+{
+ if (hibernate_restore_protection_active)
+ set_memory_rw((unsigned long)page_address, 1);
+}
+#else
+static inline void hibernate_restore_protection_begin(void) {}
+static inline void hibernate_restore_protection_end(void) {}
+static inline void hibernate_restore_protect_page(void *page_address) {}
+static inline void hibernate_restore_unprotect_page(void *page_address) {}
+#endif /* CONFIG_DEBUG_RODATA */
+
static int swsusp_page_is_free(struct page *);
static void swsusp_set_page_forbidden(struct page *);
static void swsusp_unset_page_forbidden(struct page *);
@@ -70,25 +104,32 @@ void __init hibernate_image_size_init(void)
image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE;
}
-/* List of PBEs needed for restoring the pages that were allocated before
+/*
+ * List of PBEs needed for restoring the pages that were allocated before
* the suspend and included in the suspend image, but have also been
* allocated by the "resume" kernel, so their contents cannot be written
* directly to their "original" page frames.
*/
struct pbe *restore_pblist;
-/* Pointer to an auxiliary buffer (1 page) */
-static void *buffer;
+/* struct linked_page is used to build chains of pages */
-/**
- * @safe_needed - on resume, for storing the PBE list and the image,
- * we can only use memory pages that do not conflict with the pages
- * used before suspend. The unsafe pages have PageNosaveFree set
- * and we count them using unsafe_pages.
- *
- * Each allocated image page is marked as PageNosave and PageNosaveFree
- * so that swsusp_free() can release it.
+#define LINKED_PAGE_DATA_SIZE (PAGE_SIZE - sizeof(void *))
+
+struct linked_page {
+ struct linked_page *next;
+ char data[LINKED_PAGE_DATA_SIZE];
+} __packed;
+
+/*
+ * List of "safe" pages (ie. pages that were not used by the image kernel
+ * before hibernation) that may be used as temporary storage for image kernel
+ * memory contents.
*/
+static struct linked_page *safe_pages_list;
+
+/* Pointer to an auxiliary buffer (1 page) */
+static void *buffer;
#define PG_ANY 0
#define PG_SAFE 1
@@ -97,13 +138,23 @@ static void *buffer;
static unsigned int allocated_unsafe_pages;
+/**
+ * get_image_page - Allocate a page for a hibernation image.
+ * @gfp_mask: GFP mask for the allocation.
+ * @safe_needed: Get pages that were not used before hibernation (restore only)
+ *
+ * During image restoration, for storing the PBE list and the image data, we can
+ * only use memory pages that do not conflict with the pages used before
+ * hibernation. The "unsafe" pages have PageNosaveFree set and we count them
+ * using allocated_unsafe_pages.
+ *
+ * Each allocated image page is marked as PageNosave and PageNosaveFree so that
+ * swsusp_free() can release it.
+ */
static void *get_image_page(gfp_t gfp_mask, int safe_needed)
{
void *res;
- if (toi_running)
- return (void *) toi_get_nonconflicting_page();
-
res = (void *)get_zeroed_page(gfp_mask);
if (safe_needed)
while (res && swsusp_page_is_free(virt_to_page(res))) {
@@ -119,9 +170,21 @@ static void *get_image_page(gfp_t gfp_mask, int safe_needed)
return res;
}
+static void *__get_safe_page(gfp_t gfp_mask)
+{
+ if (safe_pages_list) {
+ void *ret = safe_pages_list;
+
+ safe_pages_list = safe_pages_list->next;
+ memset(ret, 0, PAGE_SIZE);
+ return ret;
+ }
+ return get_image_page(gfp_mask, PG_SAFE);
+}
+
unsigned long get_safe_page(gfp_t gfp_mask)
{
- return (unsigned long)get_image_page(gfp_mask, PG_SAFE);
+ return (unsigned long)__get_safe_page(gfp_mask);
}
static struct page *alloc_image_page(gfp_t gfp_mask)
@@ -136,11 +199,22 @@ static struct page *alloc_image_page(gfp_t gfp_mask)
return page;
}
+static void recycle_safe_page(void *page_address)
+{
+ struct linked_page *lp = page_address;
+
+ lp->next = safe_pages_list;
+ safe_pages_list = lp;
+}
+
/**
- * free_image_page - free page represented by @addr, allocated with
- * get_image_page (page flags set by it must be cleared)
+ * free_image_page - Free a page allocated for hibernation image.
+ * @addr: Address of the page to free.
+ * @clear_nosave_free: If set, clear the PageNosaveFree bit for the page.
+ *
+ * The page to free should have been allocated by get_image_page() (page flags
+ * set by it are affected).
*/
-
static inline void free_image_page(void *addr, int clear_nosave_free)
{
struct page *page;
@@ -149,11 +223,6 @@ static inline void free_image_page(void *addr, int clear_nosave_free)
page = virt_to_page(addr);
- if (toi_running) {
- toi__free_page(29, page);
- return;
- }
-
swsusp_unset_page_forbidden(page);
if (clear_nosave_free)
swsusp_unset_page_free(page);
@@ -161,17 +230,8 @@ static inline void free_image_page(void *addr, int clear_nosave_free)
__free_page(page);
}
-/* struct linked_page is used to build chains of pages */
-
-#define LINKED_PAGE_DATA_SIZE (PAGE_SIZE - sizeof(void *))
-
-struct linked_page {
- struct linked_page *next;
- char data[LINKED_PAGE_DATA_SIZE];
-} __packed;
-
-static inline void
-free_list_of_pages(struct linked_page *list, int clear_page_nosave)
+static inline void free_list_of_pages(struct linked_page *list,
+ int clear_page_nosave)
{
while (list) {
struct linked_page *lp = list->next;
@@ -181,30 +241,28 @@ free_list_of_pages(struct linked_page *list, int clear_page_nosave)
}
}
-/**
- * struct chain_allocator is used for allocating small objects out of
- * a linked list of pages called 'the chain'.
- *
- * The chain grows each time when there is no room for a new object in
- * the current page. The allocated objects cannot be freed individually.
- * It is only possible to free them all at once, by freeing the entire
- * chain.
- *
- * NOTE: The chain allocator may be inefficient if the allocated objects
- * are not much smaller than PAGE_SIZE.
- */
-
+/*
+ * struct chain_allocator is used for allocating small objects out of
+ * a linked list of pages called 'the chain'.
+ *
+ * The chain grows each time when there is no room for a new object in
+ * the current page. The allocated objects cannot be freed individually.
+ * It is only possible to free them all at once, by freeing the entire
+ * chain.
+ *
+ * NOTE: The chain allocator may be inefficient if the allocated objects
+ * are not much smaller than PAGE_SIZE.
+ */
struct chain_allocator {
struct linked_page *chain; /* the chain */
unsigned int used_space; /* total size of objects allocated out
- * of the current page
- */
+ of the current page */
gfp_t gfp_mask; /* mask for allocating pages */
int safe_needed; /* if set, only "safe" pages are allocated */
};
-static void
-chain_init(struct chain_allocator *ca, gfp_t gfp_mask, int safe_needed)
+static void chain_init(struct chain_allocator *ca, gfp_t gfp_mask,
+ int safe_needed)
{
ca->chain = NULL;
ca->used_space = LINKED_PAGE_DATA_SIZE;
@@ -219,7 +277,8 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) {
struct linked_page *lp;
- lp = get_image_page(ca->gfp_mask, ca->safe_needed);
+ lp = ca->safe_needed ? __get_safe_page(ca->gfp_mask) :
+ get_image_page(ca->gfp_mask, PG_ANY);
if (!lp)
return NULL;
@@ -233,44 +292,44 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
}
/**
- * Data types related to memory bitmaps.
+ * Data types related to memory bitmaps.
*
- * Memory bitmap is a structure consiting of many linked lists of
- * objects. The main list's elements are of type struct zone_bitmap
- * and each of them corresonds to one zone. For each zone bitmap
- * object there is a list of objects of type struct bm_block that
- * represent each blocks of bitmap in which information is stored.
+ * Memory bitmap is a structure consisting of many linked lists of
+ * objects. The main list's elements are of type struct zone_bitmap
+ * and each of them corresponds to one zone. For each zone bitmap
+ * object there is a list of objects of type struct bm_block that
+ * represent the blocks of the bitmap in which information is stored.
*
- * struct memory_bitmap contains a pointer to the main list of zone
- * bitmap objects, a struct bm_position used for browsing the bitmap,
- * and a pointer to the list of pages used for allocating all of the
- * zone bitmap objects and bitmap block objects.
+ * struct memory_bitmap contains a pointer to the main list of zone
+ * bitmap objects, a struct bm_position used for browsing the bitmap,
+ * and a pointer to the list of pages used for allocating all of the
+ * zone bitmap objects and bitmap block objects.
*
- * NOTE: It has to be possible to lay out the bitmap in memory
- * using only allocations of order 0. Additionally, the bitmap is
- * designed to work with arbitrary number of zones (this is over the
- * top for now, but let's avoid making unnecessary assumptions ;-).
+ * NOTE: It has to be possible to lay out the bitmap in memory
+ * using only allocations of order 0. Additionally, the bitmap is
+ * designed to work with arbitrary number of zones (this is over the
+ * top for now, but let's avoid making unnecessary assumptions ;-).
*
- * struct zone_bitmap contains a pointer to a list of bitmap block
- * objects and a pointer to the bitmap block object that has been
- * most recently used for setting bits. Additionally, it contains the
- * pfns that correspond to the start and end of the represented zone.
+ * struct zone_bitmap contains a pointer to a list of bitmap block
+ * objects and a pointer to the bitmap block object that has been
+ * most recently used for setting bits. Additionally, it contains the
+ * PFNs that correspond to the start and end of the represented zone.
*
- * struct bm_block contains a pointer to the memory page in which
- * information is stored (in the form of a block of bitmap)
- * It also contains the pfns that correspond to the start and end of
- * the represented memory area.
+ * struct bm_block contains a pointer to the memory page in which
+ * information is stored (in the form of a block of bitmap).
+ * It also contains the PFNs that correspond to the start and end of
+ * the represented memory area.
*
- * The memory bitmap is organized as a radix tree to guarantee fast random
- * access to the bits. There is one radix tree for each zone (as returned
- * from create_mem_extents).
+ * The memory bitmap is organized as a radix tree to guarantee fast random
+ * access to the bits. There is one radix tree for each zone (as returned
+ * from create_mem_extents).
*
- * One radix tree is represented by one struct mem_zone_bm_rtree. There are
- * two linked lists for the nodes of the tree, one for the inner nodes and
- * one for the leave nodes. The linked leave nodes are used for fast linear
- * access of the memory bitmap.
+ * One radix tree is represented by one struct mem_zone_bm_rtree. There are
+ * two linked lists for the nodes of the tree, one for the inner nodes and
+ * one for the leaf nodes. The linked leaf nodes are used for fast linear
+ * access of the memory bitmap.
*
- * The struct rtree_node represents one node of the radix tree.
+ * The struct rtree_node represents one node of the radix tree.
*/
#define BM_END_OF_MAP (~0UL)
@@ -313,15 +372,12 @@ struct bm_position {
int node_bit;
};
-#define BM_POSITION_SLOTS (NR_CPUS * 2)
-
struct memory_bitmap {
struct list_head zones;
struct linked_page *p_list; /* list of pages used to store zone
- * bitmap objects and bitmap block
- * objects
- */
- struct bm_position cur[BM_POSITION_SLOTS]; /* most recently used bit position */
+ bitmap objects and bitmap block
+ objects */
+ struct bm_position cur; /* most recently used bit position */
};
/* Functions that operate on memory bitmaps */
@@ -334,12 +390,12 @@ struct memory_bitmap {
#endif
#define BM_RTREE_LEVEL_MASK ((1UL << BM_RTREE_LEVEL_SHIFT) - 1)
-/*
- * alloc_rtree_node - Allocate a new node and add it to the radix tree.
+/**
+ * alloc_rtree_node - Allocate a new node and add it to the radix tree.
*
- * This function is used to allocate inner nodes as well as the
- * leave nodes of the radix tree. It also adds the node to the
- * corresponding linked list passed in by the *list parameter.
+ * This function is used to allocate inner nodes as well as the
+ * leaf nodes of the radix tree. It also adds the node to the
+ * corresponding linked list passed in by the *list parameter.
*/
static struct rtree_node *alloc_rtree_node(gfp_t gfp_mask, int safe_needed,
struct chain_allocator *ca,
@@ -360,12 +416,12 @@ static struct rtree_node *alloc_rtree_node(gfp_t gfp_mask, int safe_needed,
return node;
}
-/*
- * add_rtree_block - Add a new leave node to the radix tree
+/**
+ * add_rtree_block - Add a new leaf node to the radix tree.
*
- * The leave nodes need to be allocated in order to keep the leaves
- * linked list in order. This is guaranteed by the zone->blocks
- * counter.
+ * The leaf nodes need to be allocated in order to keep the leaves
+ * linked list in order. This is guaranteed by the zone->blocks
+ * counter.
*/
static int add_rtree_block(struct mem_zone_bm_rtree *zone, gfp_t gfp_mask,
int safe_needed, struct chain_allocator *ca)
@@ -430,17 +486,18 @@ static int add_rtree_block(struct mem_zone_bm_rtree *zone, gfp_t gfp_mask,
static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone,
int clear_nosave_free);
-/*
- * create_zone_bm_rtree - create a radix tree for one zone
+/**
+ * create_zone_bm_rtree - Create a radix tree for one zone.
*
- * Allocated the mem_zone_bm_rtree structure and initializes it.
- * This function also allocated and builds the radix tree for the
- * zone.
+ * Allocate the mem_zone_bm_rtree structure and initialize it.
+ * This function also allocates and builds the radix tree for the
+ * zone.
*/
-static struct mem_zone_bm_rtree *
-create_zone_bm_rtree(gfp_t gfp_mask, int safe_needed,
- struct chain_allocator *ca,
- unsigned long start, unsigned long end)
+static struct mem_zone_bm_rtree *create_zone_bm_rtree(gfp_t gfp_mask,
+ int safe_needed,
+ struct chain_allocator *ca,
+ unsigned long start,
+ unsigned long end)
{
struct mem_zone_bm_rtree *zone;
unsigned int i, nr_blocks;
@@ -467,12 +524,12 @@ create_zone_bm_rtree(gfp_t gfp_mask, int safe_needed,
return zone;
}
-/*
- * free_zone_bm_rtree - Free the memory of the radix tree
+/**
+ * free_zone_bm_rtree - Free the memory of the radix tree.
*
- * Free all node pages of the radix tree. The mem_zone_bm_rtree
- * structure itself is not freed here nor are the rtree_node
- * structs.
+ * Free all node pages of the radix tree. The mem_zone_bm_rtree
+ * structure itself is not freed here nor are the rtree_node
+ * structs.
*/
static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone,
int clear_nosave_free)
@@ -486,39 +543,16 @@ static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone,
free_image_page(node->data, clear_nosave_free);
}
-void memory_bm_position_reset(struct memory_bitmap *bm)
+static void memory_bm_position_reset(struct memory_bitmap *bm)
{
- int index;
-
- for (index = 0; index < BM_POSITION_SLOTS; index++) {
- bm->cur[index].zone = list_entry(bm->zones.next, struct mem_zone_bm_rtree,
+ bm->cur.zone = list_entry(bm->zones.next, struct mem_zone_bm_rtree,
list);
- bm->cur[index].node = list_entry(bm->cur[index].zone->leaves.next,
+ bm->cur.node = list_entry(bm->cur.zone->leaves.next,
struct rtree_node, list);
- bm->cur[index].node_pfn = 0;
- bm->cur[index].node_bit = 0;
- }
+ bm->cur.node_pfn = 0;
+ bm->cur.node_bit = 0;
}
-static void memory_bm_clear_current(struct memory_bitmap *bm, int index);
-unsigned long memory_bm_next_pfn(struct memory_bitmap *bm, int index);
-
-/**
- * memory_bm_clear
- * @param bm - The bitmap to clear
- *
- * Only run while single threaded - locking not needed
- */
-void memory_bm_clear(struct memory_bitmap *bm)
-{
- memory_bm_position_reset(bm);
-
- while (memory_bm_next_pfn(bm, 0) != BM_END_OF_MAP) {
- memory_bm_clear_current(bm, 0);
- }
-
- memory_bm_position_reset(bm);
-}
static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
struct mem_extent {
@@ -528,8 +562,8 @@ struct mem_extent {
};
/**
- * free_mem_extents - free a list of memory extents
- * @list - list of extents to empty
+ * free_mem_extents - Free a list of memory extents.
+ * @list: List of extents to free.
*/
static void free_mem_extents(struct list_head *list)
{
@@ -542,10 +576,11 @@ static void free_mem_extents(struct list_head *list)
}
/**
- * create_mem_extents - create a list of memory extents representing
- * contiguous ranges of PFNs
- * @list - list to put the extents into
- * @gfp_mask - mask to use for memory allocations
+ * create_mem_extents - Create a list of memory extents.
+ * @list: List to put the extents into.
+ * @gfp_mask: Mask to use for memory allocations.
+ *
+ * The extents represent contiguous ranges of PFNs.
*/
static int create_mem_extents(struct list_head *list, gfp_t gfp_mask)
{
@@ -601,10 +636,10 @@ static int create_mem_extents(struct list_head *list, gfp_t gfp_mask)
}
/**
- * memory_bm_create - allocate memory for a memory bitmap
- */
-static int
-memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
+ * memory_bm_create - Allocate memory for a memory bitmap.
+ */
+static int memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask,
+ int safe_needed)
{
struct chain_allocator ca;
struct list_head mem_extents;
@@ -631,8 +666,7 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
}
bm->p_list = ca.chain;
-
- memory_bm_position_reset(bm);
+ memory_bm_position_reset(bm);
Exit:
free_mem_extents(&mem_extents);
return error;
@@ -644,8 +678,9 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
}
/**
- * memory_bm_free - free memory occupied by the memory bitmap @bm
- */
+ * memory_bm_free - Free memory occupied by the memory bitmap.
+ * @bm: Memory bitmap.
+ */
static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
{
struct mem_zone_bm_rtree *zone;
@@ -659,33 +694,22 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
}
/**
- * memory_bm_find_bit - Find the bit for pfn in the memory
- * bitmap
+ * memory_bm_find_bit - Find the bit for a given PFN in a memory bitmap.
+ *
+ * Find the bit in memory bitmap @bm that corresponds to the given PFN.
+ * The cur.zone, cur.node and cur.node_pfn members of @bm are updated.
*
- * Find the bit in the bitmap @bm that corresponds to given pfn.
- * The cur.zone, cur.block and cur.node_pfn member of @bm are
- * updated.
- * It walks the radix tree to find the page which contains the bit for
- * pfn and returns the bit position in **addr and *bit_nr.
+ * Walk the radix tree to find the page containing the bit that represents @pfn
+ * and return the position of the bit in @addr and @bit_nr.
*/
-int memory_bm_find_bit(struct memory_bitmap *bm, int index,
- unsigned long pfn, void **addr, unsigned int *bit_nr)
+static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
+ void **addr, unsigned int *bit_nr)
{
struct mem_zone_bm_rtree *curr, *zone;
struct rtree_node *node;
int i, block_nr;
- if (!bm->cur[index].zone) {
- // Reset
- bm->cur[index].zone = list_entry(bm->zones.next, struct mem_zone_bm_rtree,
- list);
- bm->cur[index].node = list_entry(bm->cur[index].zone->leaves.next,
- struct rtree_node, list);
- bm->cur[index].node_pfn = 0;
- bm->cur[index].node_bit = 0;
- }
-
- zone = bm->cur[index].zone;
+ zone = bm->cur.zone;
if (pfn >= zone->start_pfn && pfn < zone->end_pfn)
goto zone_found;
@@ -705,12 +729,11 @@ int memory_bm_find_bit(struct memory_bitmap *bm, int index,
zone_found:
/*
- * We have a zone. Now walk the radix tree to find the leave
- * node for our pfn.
+ * We have found the zone. Now walk the radix tree to find the leaf node
+ * for our PFN.
*/
-
- node = bm->cur[index].node;
- if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur[index].node_pfn)
+ node = bm->cur.node;
+ if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn)
goto node_found;
node = zone->rtree;
@@ -727,9 +750,9 @@ zone_found:
node_found:
/* Update last position */
- bm->cur[index].zone = zone;
- bm->cur[index].node = node;
- bm->cur[index].node_pfn = (pfn - zone->start_pfn) & ~BM_BLOCK_MASK;
+ bm->cur.zone = zone;
+ bm->cur.node = node;
+ bm->cur.node_pfn = (pfn - zone->start_pfn) & ~BM_BLOCK_MASK;
/* Set return values */
*addr = node->data;
@@ -738,97 +761,97 @@ node_found:
return 0;
}
-void memory_bm_set_bit(struct memory_bitmap *bm, int index, unsigned long pfn)
+static void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn)
{
void *addr;
unsigned int bit;
int error;
- error = memory_bm_find_bit(bm, index, pfn, &addr, &bit);
+ error = memory_bm_find_bit(bm, pfn, &addr, &bit);
BUG_ON(error);
set_bit(bit, addr);
}
-int mem_bm_set_bit_check(struct memory_bitmap *bm, int index, unsigned long pfn)
+static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn)
{
void *addr;
unsigned int bit;
int error;
- error = memory_bm_find_bit(bm, index, pfn, &addr, &bit);
+ error = memory_bm_find_bit(bm, pfn, &addr, &bit);
if (!error)
set_bit(bit, addr);
return error;
}
-void memory_bm_clear_bit(struct memory_bitmap *bm, int index, unsigned long pfn)
+static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn)
{
void *addr;
unsigned int bit;
int error;
- error = memory_bm_find_bit(bm, index, pfn, &addr, &bit);
+ error = memory_bm_find_bit(bm, pfn, &addr, &bit);
BUG_ON(error);
clear_bit(bit, addr);
}
-static void memory_bm_clear_current(struct memory_bitmap *bm, int index)
+static void memory_bm_clear_current(struct memory_bitmap *bm)
{
int bit;
- bit = max(bm->cur[index].node_bit - 1, 0);
- clear_bit(bit, bm->cur[index].node->data);
+ bit = max(bm->cur.node_bit - 1, 0);
+ clear_bit(bit, bm->cur.node->data);
}
-int memory_bm_test_bit(struct memory_bitmap *bm, int index, unsigned long pfn)
+static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn)
{
void *addr;
unsigned int bit;
int error;
- error = memory_bm_find_bit(bm, index, pfn, &addr, &bit);
+ error = memory_bm_find_bit(bm, pfn, &addr, &bit);
BUG_ON(error);
return test_bit(bit, addr);
}
-static bool memory_bm_pfn_present(struct memory_bitmap *bm, int index, unsigned long pfn)
+static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn)
{
void *addr;
unsigned int bit;
- return !memory_bm_find_bit(bm, index, pfn, &addr, &bit);
+ return !memory_bm_find_bit(bm, pfn, &addr, &bit);
}
/*
- * rtree_next_node - Jumps to the next leave node
+ * rtree_next_node - Jump to the next leaf node.
*
- * Sets the position to the beginning of the next node in the
- * memory bitmap. This is either the next node in the current
- * zone's radix tree or the first node in the radix tree of the
- * next zone.
+ * Set the position to the beginning of the next node in the
+ * memory bitmap. This is either the next node in the current
+ * zone's radix tree or the first node in the radix tree of the
+ * next zone.
*
- * Returns true if there is a next node, false otherwise.
+ * Return true if there is a next node, false otherwise.
*/
-static bool rtree_next_node(struct memory_bitmap *bm, int index)
+static bool rtree_next_node(struct memory_bitmap *bm)
{
- if (!list_is_last(&bm->cur[index].node->list, &bm->cur[index].zone->leaves)) {
- bm->cur[index].node = list_entry(bm->cur[index].node->list.next,
+ if (!list_is_last(&bm->cur.node->list, &bm->cur.zone->leaves)) {
+ bm->cur.node = list_entry(bm->cur.node->list.next,
struct rtree_node, list);
- bm->cur[index].node_pfn += BM_BITS_PER_BLOCK;
- bm->cur[index].node_bit = 0;
+ bm->cur.node_pfn += BM_BITS_PER_BLOCK;
+ bm->cur.node_bit = 0;
touch_softlockup_watchdog();
return true;
}
/* No more nodes, goto next zone */
- if (!list_is_last(&bm->cur[index].zone->list, &bm->zones)) {
- bm->cur[index].zone = list_entry(bm->cur[index].zone->list.next,
+ if (!list_is_last(&bm->cur.zone->list, &bm->zones)) {
+ bm->cur.zone = list_entry(bm->cur.zone->list.next,
struct mem_zone_bm_rtree, list);
- bm->cur[index].node = list_entry(bm->cur[index].zone->leaves.next,
+ bm->cur.node = list_entry(bm->cur.zone->leaves.next,
struct rtree_node, list);
- bm->cur[index].node_pfn = 0;
- bm->cur[index].node_bit = 0;
+ bm->cur.node_pfn = 0;
+ bm->cur.node_bit = 0;
return true;
}
@@ -837,48 +860,84 @@ static bool rtree_next_node(struct memory_bitmap *bm, int index)
}
/**
- * memory_bm_rtree_next_pfn - Find the next set bit in the bitmap @bm
+ * memory_bm_next_pfn - Find the next set bit in a memory bitmap.
+ * @bm: Memory bitmap.
*
- * Starting from the last returned position this function searches
- * for the next set bit in the memory bitmap and returns its
- * number. If no more bit is set BM_END_OF_MAP is returned.
+ * Starting from the last returned position this function searches for the next
+ * set bit in @bm and returns the PFN represented by it. If no more bits are
+ * set, BM_END_OF_MAP is returned.
*
- * It is required to run memory_bm_position_reset() before the
- * first call to this function.
+ * It is required to run memory_bm_position_reset() before the first call to
+ * this function for the given memory bitmap.
*/
-unsigned long memory_bm_next_pfn(struct memory_bitmap *bm, int index)
+static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
{
unsigned long bits, pfn, pages;
int bit;
- index += NR_CPUS; /* Iteration state is separated from get/set/test */
-
do {
- pages = bm->cur[index].zone->end_pfn - bm->cur[index].zone->start_pfn;
- bits = min(pages - bm->cur[index].node_pfn, BM_BITS_PER_BLOCK);
- bit = find_next_bit(bm->cur[index].node->data, bits,
- bm->cur[index].node_bit);
+ pages = bm->cur.zone->end_pfn - bm->cur.zone->start_pfn;
+ bits = min(pages - bm->cur.node_pfn, BM_BITS_PER_BLOCK);
+ bit = find_next_bit(bm->cur.node->data, bits,
+ bm->cur.node_bit);
if (bit < bits) {
- pfn = bm->cur[index].zone->start_pfn + bm->cur[index].node_pfn + bit;
- bm->cur[index].node_bit = bit + 1;
+ pfn = bm->cur.zone->start_pfn + bm->cur.node_pfn + bit;
+ bm->cur.node_bit = bit + 1;
return pfn;
}
- } while (rtree_next_node(bm, index));
+ } while (rtree_next_node(bm));
return BM_END_OF_MAP;
}
-LIST_HEAD(nosave_regions);
+/*
+ * This structure represents a range of page frames the contents of which
+ * should not be saved during hibernation.
+ */
+struct nosave_region {
+ struct list_head list;
+ unsigned long start_pfn;
+ unsigned long end_pfn;
+};
+
+static LIST_HEAD(nosave_regions);
+
+static void recycle_zone_bm_rtree(struct mem_zone_bm_rtree *zone)
+{
+ struct rtree_node *node;
+
+ list_for_each_entry(node, &zone->nodes, list)
+ recycle_safe_page(node->data);
+
+ list_for_each_entry(node, &zone->leaves, list)
+ recycle_safe_page(node->data);
+}
+
+static void memory_bm_recycle(struct memory_bitmap *bm)
+{
+ struct mem_zone_bm_rtree *zone;
+ struct linked_page *p_list;
+
+ list_for_each_entry(zone, &bm->zones, list)
+ recycle_zone_bm_rtree(zone);
+
+ p_list = bm->p_list;
+ while (p_list) {
+ struct linked_page *lp = p_list;
+
+ p_list = lp->next;
+ recycle_safe_page(lp);
+ }
+}
/**
- * register_nosave_region - register a range of page frames the contents
- * of which should not be saved during the suspend (to be used in the early
- * initialization code)
+ * __register_nosave_region - Register a region of unsaveable memory.
+ *
+ * Register a range of page frames the contents of which should not be saved
+ * during hibernation (to be used in the early initialization code).
*/
-
-void __init
-__register_nosave_region(unsigned long start_pfn, unsigned long end_pfn,
- int use_kmalloc)
+void __init __register_nosave_region(unsigned long start_pfn,
+ unsigned long end_pfn, int use_kmalloc)
{
struct nosave_region *region;
@@ -895,12 +954,13 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn,
}
}
if (use_kmalloc) {
- /* during init, this shouldn't fail */
+ /* During init, this shouldn't fail */
region = kmalloc(sizeof(struct nosave_region), GFP_KERNEL);
BUG_ON(!region);
- } else
+ } else {
/* This allocation cannot fail */
region = memblock_virt_alloc(sizeof(struct nosave_region), 0);
+ }
region->start_pfn = start_pfn;
region->end_pfn = end_pfn;
list_add_tail(&region->list, &nosave_regions);
@@ -927,44 +987,46 @@ static struct memory_bitmap *free_pages_map;
void swsusp_set_page_free(struct page *page)
{
if (free_pages_map)
- memory_bm_set_bit(free_pages_map, 0, page_to_pfn(page));
+ memory_bm_set_bit(free_pages_map, page_to_pfn(page));
}
static int swsusp_page_is_free(struct page *page)
{
return free_pages_map ?
- memory_bm_test_bit(free_pages_map, 0, page_to_pfn(page)) : 0;
+ memory_bm_test_bit(free_pages_map, page_to_pfn(page)) : 0;
}
void swsusp_unset_page_free(struct page *page)
{
if (free_pages_map)
- memory_bm_clear_bit(free_pages_map, 0, page_to_pfn(page));
+ memory_bm_clear_bit(free_pages_map, page_to_pfn(page));
}
static void swsusp_set_page_forbidden(struct page *page)
{
if (forbidden_pages_map)
- memory_bm_set_bit(forbidden_pages_map, 0, page_to_pfn(page));
+ memory_bm_set_bit(forbidden_pages_map, page_to_pfn(page));
}
int swsusp_page_is_forbidden(struct page *page)
{
return forbidden_pages_map ?
- memory_bm_test_bit(forbidden_pages_map, 0, page_to_pfn(page)) : 0;
+ memory_bm_test_bit(forbidden_pages_map, page_to_pfn(page)) : 0;
}
static void swsusp_unset_page_forbidden(struct page *page)
{
if (forbidden_pages_map)
- memory_bm_clear_bit(forbidden_pages_map, 0, page_to_pfn(page));
+ memory_bm_clear_bit(forbidden_pages_map, page_to_pfn(page));
}
/**
- * mark_nosave_pages - set bits corresponding to the page frames the
- * contents of which should not be saved in a given bitmap.
+ * mark_nosave_pages - Mark pages that should not be saved.
+ * @bm: Memory bitmap.
+ *
+ * Set the bits in @bm that correspond to the page frames the contents of which
+ * should not be saved.
*/
-
static void mark_nosave_pages(struct memory_bitmap *bm)
{
struct nosave_region *region;
@@ -988,19 +1050,19 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
* touch the PFNs for which the error is
* returned anyway.
*/
- mem_bm_set_bit_check(bm, 0, pfn);
+ mem_bm_set_bit_check(bm, pfn);
}
}
}
/**
- * create_basic_memory_bitmaps - create bitmaps needed for marking page
- * frames that should not be saved and free page frames. The pointers
- * forbidden_pages_map and free_pages_map are only modified if everything
- * goes well, because we don't want the bits to be used before both bitmaps
- * are set up.
+ * create_basic_memory_bitmaps - Create bitmaps to hold basic page information.
+ *
+ * Create bitmaps needed for marking page frames that should not be saved and
+ * free page frames. The forbidden_pages_map and free_pages_map pointers are
+ * only modified if everything goes well, because we don't want the bits to be
+ * touched before both bitmaps are set up.
*/
-
int create_basic_memory_bitmaps(void)
{
struct memory_bitmap *bm1, *bm2;
@@ -1045,12 +1107,12 @@ int create_basic_memory_bitmaps(void)
}
/**
- * free_basic_memory_bitmaps - free memory bitmaps allocated by
- * create_basic_memory_bitmaps(). The auxiliary pointers are necessary
- * so that the bitmaps themselves are not referred to while they are being
- * freed.
+ * free_basic_memory_bitmaps - Free memory bitmaps holding basic information.
+ *
+ * Free memory bitmaps allocated by create_basic_memory_bitmaps(). The
+ * auxiliary pointers are necessary so that the bitmaps themselves are not
+ * referred to while they are being freed.
*/
-
void free_basic_memory_bitmaps(void)
{
struct memory_bitmap *bm1, *bm2;
@@ -1071,11 +1133,13 @@ void free_basic_memory_bitmaps(void)
}
/**
- * snapshot_additional_pages - estimate the number of additional pages
- * be needed for setting up the suspend image data structures for given
- * zone (usually the returned value is greater than the exact number)
+ * snapshot_additional_pages - Estimate the number of extra pages needed.
+ * @zone: Memory zone to carry out the computation for.
+ *
+ * Estimate the number of additional pages needed for setting up hibernation
+ * image data structures for @zone (usually, the returned value is greater than
+ * the exact number).
*/
-
unsigned int snapshot_additional_pages(struct zone *zone)
{
unsigned int rtree, nodes;
@@ -1093,10 +1157,10 @@ unsigned int snapshot_additional_pages(struct zone *zone)
#ifdef CONFIG_HIGHMEM
/**
- * count_free_highmem_pages - compute the total number of free highmem
- * pages, system-wide.
+ * count_free_highmem_pages - Compute the total number of free highmem pages.
+ *
+ * The returned number is system-wide.
*/
-
static unsigned int count_free_highmem_pages(void)
{
struct zone *zone;
@@ -1110,13 +1174,14 @@ static unsigned int count_free_highmem_pages(void)
}
/**
- * saveable_highmem_page - Determine whether a highmem page should be
- * included in the suspend image.
+ * saveable_highmem_page - Check if a highmem page is saveable.
*
- * We should save the page if it isn't Nosave or NosaveFree, or Reserved,
- * and it isn't a part of a free chunk of pages.
+ * Determine whether a highmem page should be included in a hibernation image.
+ *
+ * We should save the page if it isn't Nosave or NosaveFree, or Reserved,
+ * and it isn't part of a free chunk of pages.
*/
-struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
+static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
{
struct page *page;
@@ -1140,10 +1205,8 @@ struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
}
/**
- * count_highmem_pages - compute the total number of saveable highmem
- * pages.
+ * count_highmem_pages - Compute the total number of saveable highmem pages.
*/
-
static unsigned int count_highmem_pages(void)
{
struct zone *zone;
@@ -1163,17 +1226,24 @@ static unsigned int count_highmem_pages(void)
}
return n;
}
+#else
+static inline void *saveable_highmem_page(struct zone *z, unsigned long p)
+{
+ return NULL;
+}
#endif /* CONFIG_HIGHMEM */
/**
- * saveable_page - Determine whether a non-highmem page should be included
- * in the suspend image.
+ * saveable_page - Check if the given page is saveable.
+ *
+ * Determine whether a non-highmem page should be included in a hibernation
+ * image.
*
- * We should save the page if it isn't Nosave, and is not in the range
- * of pages statically defined as 'unsaveable', and it isn't a part of
- * a free chunk of pages.
+ * We should save the page if it isn't Nosave, and is not in the range
+ * of pages statically defined as 'unsaveable', and it isn't part of
+ * a free chunk of pages.
*/
-struct page *saveable_page(struct zone *zone, unsigned long pfn)
+static struct page *saveable_page(struct zone *zone, unsigned long pfn)
{
struct page *page;
@@ -1200,10 +1270,8 @@ struct page *saveable_page(struct zone *zone, unsigned long pfn)
}
/**
- * count_data_pages - compute the total number of saveable non-highmem
- * pages.
+ * count_data_pages - Compute the total number of saveable non-highmem pages.
*/
-
static unsigned int count_data_pages(void)
{
struct zone *zone;
@@ -1223,7 +1291,8 @@ static unsigned int count_data_pages(void)
return n;
}
-/* This is needed, because copy_page and memcpy are not usable for copying
+/*
+ * This is needed, because copy_page and memcpy are not usable for copying
* task structs.
*/
static inline void do_copy_page(long *dst, long *src)
@@ -1234,12 +1303,12 @@ static inline void do_copy_page(long *dst, long *src)
*dst++ = *src++;
}
-
/**
- * safe_copy_page - check if the page we are going to copy is marked as
- * present in the kernel page tables (this always is the case if
- * CONFIG_DEBUG_PAGEALLOC is not set and in that case
- * kernel_page_present() always returns 'true').
+ * safe_copy_page - Copy a page in a safe way.
+ *
+ * Check if the page we are going to copy is marked as present in the kernel
+ * page tables (this always is the case if CONFIG_DEBUG_PAGEALLOC is not set
+ * and in that case kernel_page_present() always returns 'true').
*/
static void safe_copy_page(void *dst, struct page *s_page)
{
@@ -1252,10 +1321,8 @@ static void safe_copy_page(void *dst, struct page *s_page)
}
}
-
#ifdef CONFIG_HIGHMEM
-static inline struct page *
-page_is_saveable(struct zone *zone, unsigned long pfn)
+static inline struct page *page_is_saveable(struct zone *zone, unsigned long pfn)
{
return is_highmem(zone) ?
saveable_highmem_page(zone, pfn) : saveable_page(zone, pfn);
@@ -1276,7 +1343,8 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
kunmap_atomic(src);
} else {
if (PageHighMem(d_page)) {
- /* Page pointed to by src may contain some kernel
+ /*
+ * The page pointed to by src may contain some kernel
* data modified by kmap_atomic()
*/
safe_copy_page(buffer, s_page);
@@ -1298,8 +1366,8 @@ static inline void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
}
#endif /* CONFIG_HIGHMEM */
-static void
-copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
+static void copy_data_pages(struct memory_bitmap *copy_bm,
+ struct memory_bitmap *orig_bm)
{
struct zone *zone;
unsigned long pfn;
@@ -1311,15 +1379,15 @@ copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
max_zone_pfn = zone_end_pfn(zone);
for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
if (page_is_saveable(zone, pfn))
- memory_bm_set_bit(orig_bm, 0, pfn);
+ memory_bm_set_bit(orig_bm, pfn);
}
memory_bm_position_reset(orig_bm);
memory_bm_position_reset(copy_bm);
for(;;) {
- pfn = memory_bm_next_pfn(orig_bm, 0);
+ pfn = memory_bm_next_pfn(orig_bm);
if (unlikely(pfn == BM_END_OF_MAP))
break;
- copy_data_page(memory_bm_next_pfn(copy_bm, 0), pfn);
+ copy_data_page(memory_bm_next_pfn(copy_bm), pfn);
}
}
@@ -1348,12 +1416,11 @@ static struct memory_bitmap orig_bm;
static struct memory_bitmap copy_bm;
/**
- * swsusp_free - free pages allocated for the suspend.
+ * swsusp_free - Free pages allocated for hibernation image.
*
- * Suspend pages are alocated before the atomic copy is made, so we
- * need to release them after the resume.
+ * Image pages are allocated before snapshot creation, so they need to be
+ * released after resume.
*/
-
void swsusp_free(void)
{
unsigned long fb_pfn, fr_pfn;
@@ -1365,8 +1432,8 @@ void swsusp_free(void)
memory_bm_position_reset(free_pages_map);
loop:
- fr_pfn = memory_bm_next_pfn(free_pages_map, 0);
- fb_pfn = memory_bm_next_pfn(forbidden_pages_map, 0);
+ fr_pfn = memory_bm_next_pfn(free_pages_map);
+ fb_pfn = memory_bm_next_pfn(forbidden_pages_map);
/*
* Find the next bit set in both bitmaps. This is guaranteed to
@@ -1374,16 +1441,17 @@ loop:
*/
do {
if (fb_pfn < fr_pfn)
- fb_pfn = memory_bm_next_pfn(forbidden_pages_map, 0);
+ fb_pfn = memory_bm_next_pfn(forbidden_pages_map);
if (fr_pfn < fb_pfn)
- fr_pfn = memory_bm_next_pfn(free_pages_map, 0);
+ fr_pfn = memory_bm_next_pfn(free_pages_map);
} while (fb_pfn != fr_pfn);
if (fr_pfn != BM_END_OF_MAP && pfn_valid(fr_pfn)) {
struct page *page = pfn_to_page(fr_pfn);
- memory_bm_clear_current(forbidden_pages_map, 0);
- memory_bm_clear_current(free_pages_map, 0);
+ memory_bm_clear_current(forbidden_pages_map);
+ memory_bm_clear_current(free_pages_map);
+ hibernate_restore_unprotect_page(page_address(page));
__free_page(page);
goto loop;
}
@@ -1395,6 +1463,7 @@ out:
buffer = NULL;
alloc_normal = 0;
alloc_highmem = 0;
+ hibernate_restore_protection_end();
}
/* Helper functions used for the shrinking of memory. */
@@ -1402,7 +1471,7 @@ out:
#define GFP_IMAGE (GFP_KERNEL | __GFP_NOWARN)
/**
- * preallocate_image_pages - Allocate a number of pages for hibernation image
+ * preallocate_image_pages - Allocate a number of pages for hibernation image.
* @nr_pages: Number of page frames to allocate.
* @mask: GFP flags to use for the allocation.
*
@@ -1418,7 +1487,7 @@ static unsigned long preallocate_image_pages(unsigned long nr_pages, gfp_t mask)
page = alloc_image_page(mask);
if (!page)
break;
- memory_bm_set_bit(&copy_bm, 0, page_to_pfn(page));
+ memory_bm_set_bit(&copy_bm, page_to_pfn(page));
if (PageHighMem(page))
alloc_highmem++;
else
@@ -1452,7 +1521,7 @@ static unsigned long preallocate_image_highmem(unsigned long nr_pages)
}
/**
- * __fraction - Compute (an approximation of) x * (multiplier / base)
+ * __fraction - Compute (an approximation of) x * (multiplier / base).
*/
static unsigned long __fraction(u64 x, u64 multiplier, u64 base)
{
@@ -1462,8 +1531,8 @@ static unsigned long __fraction(u64 x, u64 multiplier, u64 base)
}
static unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
- unsigned long highmem,
- unsigned long total)
+ unsigned long highmem,
+ unsigned long total)
{
unsigned long alloc = __fraction(nr_pages, highmem, total);
@@ -1476,15 +1545,15 @@ static inline unsigned long preallocate_image_highmem(unsigned long nr_pages)
}
static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
- unsigned long highmem,
- unsigned long total)
+ unsigned long highmem,
+ unsigned long total)
{
return 0;
}
#endif /* CONFIG_HIGHMEM */
/**
- * free_unnecessary_pages - Release preallocated pages not needed for the image
+ * free_unnecessary_pages - Release preallocated pages not needed for the image.
*/
static unsigned long free_unnecessary_pages(void)
{
@@ -1514,7 +1583,7 @@ static unsigned long free_unnecessary_pages(void)
memory_bm_position_reset(&copy_bm);
while (to_free_normal > 0 || to_free_highmem > 0) {
- unsigned long pfn = memory_bm_next_pfn(&copy_bm, 0);
+ unsigned long pfn = memory_bm_next_pfn(&copy_bm);
struct page *page = pfn_to_page(pfn);
if (PageHighMem(page)) {
@@ -1528,7 +1597,7 @@ static unsigned long free_unnecessary_pages(void)
to_free_normal--;
alloc_normal--;
}
- memory_bm_clear_bit(&copy_bm, 0, pfn);
+ memory_bm_clear_bit(&copy_bm, pfn);
swsusp_unset_page_forbidden(page);
swsusp_unset_page_free(page);
__free_page(page);
@@ -1538,7 +1607,7 @@ static unsigned long free_unnecessary_pages(void)
}
/**
- * minimum_image_size - Estimate the minimum acceptable size of an image
+ * minimum_image_size - Estimate the minimum acceptable size of an image.
* @saveable: Number of saveable pages in the system.
*
* We want to avoid attempting to free too much memory too hard, so estimate the
@@ -1558,17 +1627,17 @@ static unsigned long minimum_image_size(unsigned long saveable)
unsigned long size;
size = global_page_state(NR_SLAB_RECLAIMABLE)
- + global_page_state(NR_ACTIVE_ANON)
- + global_page_state(NR_INACTIVE_ANON)
- + global_page_state(NR_ACTIVE_FILE)
- + global_page_state(NR_INACTIVE_FILE)
- - global_page_state(NR_FILE_MAPPED);
+ + global_node_page_state(NR_ACTIVE_ANON)
+ + global_node_page_state(NR_INACTIVE_ANON)
+ + global_node_page_state(NR_ACTIVE_FILE)
+ + global_node_page_state(NR_INACTIVE_FILE)
+ - global_node_page_state(NR_FILE_MAPPED);
return saveable <= size ? 0 : saveable - size;
}
/**
- * hibernate_preallocate_memory - Preallocate memory for hibernation image
+ * hibernate_preallocate_memory - Preallocate memory for hibernation image.
*
* To create a hibernation image it is necessary to make a copy of every page
* frame in use. We also need a number of page frames to be free during
@@ -1741,10 +1810,11 @@ int hibernate_preallocate_memory(void)
#ifdef CONFIG_HIGHMEM
/**
- * count_pages_for_highmem - compute the number of non-highmem pages
- * that will be necessary for creating copies of highmem pages.
- */
-
+ * count_pages_for_highmem - Count non-highmem pages needed for copying highmem.
+ *
+ * Compute the number of non-highmem pages that will be necessary for creating
+ * copies of highmem pages.
+ */
static unsigned int count_pages_for_highmem(unsigned int nr_highmem)
{
unsigned int free_highmem = count_free_highmem_pages() + alloc_highmem;
@@ -1757,15 +1827,12 @@ static unsigned int count_pages_for_highmem(unsigned int nr_highmem)
return nr_highmem;
}
#else
-static unsigned int
-count_pages_for_highmem(unsigned int nr_highmem) { return 0; }
+static unsigned int count_pages_for_highmem(unsigned int nr_highmem) { return 0; }
#endif /* CONFIG_HIGHMEM */
/**
- * enough_free_mem - Make sure we have enough free memory for the
- * snapshot image.
+ * enough_free_mem - Check if there is enough free memory for the image.
*/
-
static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
{
struct zone *zone;
@@ -1784,10 +1851,11 @@ static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
#ifdef CONFIG_HIGHMEM
/**
- * get_highmem_buffer - if there are some highmem pages in the suspend
- * image, we may need the buffer to copy them and/or load their data.
+ * get_highmem_buffer - Allocate a buffer for highmem pages.
+ *
+ * If there are some highmem pages in the hibernation image, we may need a
+ * buffer to copy them and/or load their data.
*/
-
static inline int get_highmem_buffer(int safe_needed)
{
buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed);
@@ -1795,13 +1863,13 @@ static inline int get_highmem_buffer(int safe_needed)
}
/**
- * alloc_highmem_image_pages - allocate some highmem pages for the image.
- * Try to allocate as many pages as needed, but if the number of free
- * highmem pages is lesser than that, allocate them all.
+ * alloc_highmem_pages - Allocate some highmem pages for the image.
+ *
+ * Try to allocate as many pages as needed, but if the number of free highmem
+ * pages is less than that, allocate them all.
*/
-
-static inline unsigned int
-alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
+static inline unsigned int alloc_highmem_pages(struct memory_bitmap *bm,
+ unsigned int nr_highmem)
{
unsigned int to_alloc = count_free_highmem_pages();
@@ -1813,32 +1881,31 @@ alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
struct page *page;
page = alloc_image_page(__GFP_HIGHMEM|__GFP_KSWAPD_RECLAIM);
- memory_bm_set_bit(bm, 0, page_to_pfn(page));
+ memory_bm_set_bit(bm, page_to_pfn(page));
}
return nr_highmem;
}
#else
static inline int get_highmem_buffer(int safe_needed) { return 0; }
-static inline unsigned int
-alloc_highmem_pages(struct memory_bitmap *bm, unsigned int n) { return 0; }
+static inline unsigned int alloc_highmem_pages(struct memory_bitmap *bm,
+ unsigned int n) { return 0; }
#endif /* CONFIG_HIGHMEM */
/**
- * swsusp_alloc - allocate memory for the suspend image
+ * swsusp_alloc - Allocate memory for hibernation image.
*
- * We first try to allocate as many highmem pages as there are
- * saveable highmem pages in the system. If that fails, we allocate
- * non-highmem pages for the copies of the remaining highmem ones.
+ * We first try to allocate as many highmem pages as there are
+ * saveable highmem pages in the system. If that fails, we allocate
+ * non-highmem pages for the copies of the remaining highmem ones.
*
- * In this approach it is likely that the copies of highmem pages will
- * also be located in the high memory, because of the way in which
- * copy_data_pages() works.
+ * In this approach it is likely that the copies of highmem pages will
+ * also be located in the high memory, because of the way in which
+ * copy_data_pages() works.
*/
-
-static int
-swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
- unsigned int nr_pages, unsigned int nr_highmem)
+static int swsusp_alloc(struct memory_bitmap *orig_bm,
+ struct memory_bitmap *copy_bm,
+ unsigned int nr_pages, unsigned int nr_highmem)
{
if (nr_highmem > 0) {
if (get_highmem_buffer(PG_ANY))
@@ -1856,7 +1923,7 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
page = alloc_image_page(GFP_ATOMIC | __GFP_COLD);
if (!page)
goto err_out;
- memory_bm_set_bit(copy_bm, 0, page_to_pfn(page));
+ memory_bm_set_bit(copy_bm, page_to_pfn(page));
}
}
@@ -1871,9 +1938,6 @@ asmlinkage __visible int swsusp_save(void)
{
unsigned int nr_pages, nr_highmem;
- if (toi_running)
- return toi_post_context_save();
-
printk(KERN_INFO "PM: Creating hibernation image:\n");
drain_local_pages(NULL);
@@ -1891,7 +1955,8 @@ asmlinkage __visible int swsusp_save(void)
return -ENOMEM;
}
- /* During allocating of suspend pagedir, new cold pages may appear.
+ /*
+ * During allocating of suspend pagedir, new cold pages may appear.
* Kill them.
*/
drain_local_pages(NULL);
@@ -1921,7 +1986,7 @@ static int init_header_complete(struct swsusp_info *info)
return 0;
}
-char *check_image_kernel(struct swsusp_info *info)
+static char *check_image_kernel(struct swsusp_info *info)
{
if (info->version_code != LINUX_VERSION_CODE)
return "kernel version";
@@ -1942,7 +2007,7 @@ unsigned long snapshot_get_image_size(void)
return nr_copy_pages + nr_meta_pages + 1;
}
-int init_header(struct swsusp_info *info)
+static int init_header(struct swsusp_info *info)
{
memset(info, 0, sizeof(struct swsusp_info));
info->num_physpages = get_num_physpages();
@@ -1954,17 +2019,19 @@ int init_header(struct swsusp_info *info)
}
/**
- * pack_pfns - pfns corresponding to the set bits found in the bitmap @bm
- * are stored in the array @buf[] (1 page at a time)
+ * pack_pfns - Prepare PFNs for saving.
+ * @bm: Memory bitmap.
+ * @buf: Memory buffer to store the PFNs in.
+ *
+ * PFNs corresponding to set bits in @bm are stored in the area of memory
+ * pointed to by @buf (1 page at a time).
*/
-
-static inline void
-pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
+static inline void pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
{
int j;
for (j = 0; j < PAGE_SIZE / sizeof(long); j++) {
- buf[j] = memory_bm_next_pfn(bm, 0);
+ buf[j] = memory_bm_next_pfn(bm);
if (unlikely(buf[j] == BM_END_OF_MAP))
break;
/* Save page key for data page (s390 only). */
@@ -1973,22 +2040,21 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
}
/**
- * snapshot_read_next - used for reading the system memory snapshot.
+ * snapshot_read_next - Get the address to read the next image page from.
+ * @handle: Snapshot handle to be used for the reading.
*
- * On the first call to it @handle should point to a zeroed
- * snapshot_handle structure. The structure gets updated and a pointer
- * to it should be passed to this function every next time.
+ * On the first call, @handle should point to a zeroed snapshot_handle
+ * structure. The structure gets populated then and a pointer to it should be
+ * passed to this function every next time.
*
- * On success the function returns a positive number. Then, the caller
- * is allowed to read up to the returned number of bytes from the memory
- * location computed by the data_of() macro.
+ * On success, the function returns a positive number. Then, the caller
+ * is allowed to read up to the returned number of bytes from the memory
+ * location computed by the data_of() macro.
*
- * The function returns 0 to indicate the end of data stream condition,
- * and a negative number is returned on error. In such cases the
- * structure pointed to by @handle is not updated and should not be used
- * any more.
+ * The function returns 0 to indicate the end of the data stream condition,
+ * and negative numbers are returned on errors. If that happens, the structure
+ * pointed to by @handle is not updated and should not be used any more.
*/
-
int snapshot_read_next(struct snapshot_handle *handle)
{
if (handle->cur > nr_meta_pages + nr_copy_pages)
@@ -2015,9 +2081,10 @@ int snapshot_read_next(struct snapshot_handle *handle)
} else {
struct page *page;
- page = pfn_to_page(memory_bm_next_pfn(&copy_bm, 0));
+ page = pfn_to_page(memory_bm_next_pfn(&copy_bm));
if (PageHighMem(page)) {
- /* Highmem pages are copied to the buffer,
+ /*
+ * Highmem pages are copied to the buffer,
* because we can't return with a kmapped
* highmem page (we may not be called again).
*/
@@ -2035,53 +2102,41 @@ int snapshot_read_next(struct snapshot_handle *handle)
return PAGE_SIZE;
}
-/**
- * mark_unsafe_pages - mark the pages that cannot be used for storing
- * the image during resume, because they conflict with the pages that
- * had been used before suspend
- */
-
-static int mark_unsafe_pages(struct memory_bitmap *bm)
+static void duplicate_memory_bitmap(struct memory_bitmap *dst,
+ struct memory_bitmap *src)
{
- struct zone *zone;
- unsigned long pfn, max_zone_pfn;
+ unsigned long pfn;
- /* Clear page flags */
- for_each_populated_zone(zone) {
- max_zone_pfn = zone_end_pfn(zone);
- for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
- if (pfn_valid(pfn))
- swsusp_unset_page_free(pfn_to_page(pfn));
+ memory_bm_position_reset(src);
+ pfn = memory_bm_next_pfn(src);
+ while (pfn != BM_END_OF_MAP) {
+ memory_bm_set_bit(dst, pfn);
+ pfn = memory_bm_next_pfn(src);
}
-
- /* Mark pages that correspond to the "original" pfns as "unsafe" */
- memory_bm_position_reset(bm);
- do {
- pfn = memory_bm_next_pfn(bm, 0);
- if (likely(pfn != BM_END_OF_MAP)) {
- if (likely(pfn_valid(pfn)))
- swsusp_set_page_free(pfn_to_page(pfn));
- else
- return -EFAULT;
- }
- } while (pfn != BM_END_OF_MAP);
-
- allocated_unsafe_pages = 0;
-
- return 0;
}
-static void
-duplicate_memory_bitmap(struct memory_bitmap *dst, struct memory_bitmap *src)
+/**
+ * mark_unsafe_pages - Mark pages that were used before hibernation.
+ *
+ * Mark the pages that cannot be used for storing the image during restoration,
+ * because they conflict with the pages that had been used before hibernation.
+ */
+static void mark_unsafe_pages(struct memory_bitmap *bm)
{
unsigned long pfn;
- memory_bm_position_reset(src);
- pfn = memory_bm_next_pfn(src, 0);
+ /* Clear the "free"/"unsafe" bit for all PFNs */
+ memory_bm_position_reset(free_pages_map);
+ pfn = memory_bm_next_pfn(free_pages_map);
while (pfn != BM_END_OF_MAP) {
- memory_bm_set_bit(dst, 0, pfn);
- pfn = memory_bm_next_pfn(src, 0);
+ memory_bm_clear_current(free_pages_map);
+ pfn = memory_bm_next_pfn(free_pages_map);
}
+
+ /* Mark pages that correspond to the "original" PFNs as "unsafe" */
+ duplicate_memory_bitmap(free_pages_map, bm);
+
+ allocated_unsafe_pages = 0;
}
static int check_header(struct swsusp_info *info)
@@ -2099,11 +2154,9 @@ static int check_header(struct swsusp_info *info)
}
/**
- * load header - check the image header and copy data from it
+ * load_header - Check the image header and copy the data from it.
*/
-
-static int
-load_header(struct swsusp_info *info)
+static int load_header(struct swsusp_info *info)
{
int error;
@@ -2117,8 +2170,12 @@ load_header(struct swsusp_info *info)
}
/**
- * unpack_orig_pfns - for each element of @buf[] (1 page at a time) set
- * the corresponding bit in the memory bitmap @bm
+ * unpack_orig_pfns - Set bits corresponding to given PFNs in a memory bitmap.
+ * @bm: Memory bitmap.
+ * @buf: Area of memory containing the PFNs.
+ *
+ * For each element of the array pointed to by @buf (1 page at a time), set the
+ * corresponding bit in @bm.
*/
static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
{
@@ -2131,8 +2188,8 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
/* Extract and buffer page key for data page (s390 only). */
page_key_memorize(buf + j);
- if (memory_bm_pfn_present(bm, 0, buf[j]))
- memory_bm_set_bit(bm, 0, buf[j]);
+ if (pfn_valid(buf[j]) && memory_bm_pfn_present(bm, buf[j]))
+ memory_bm_set_bit(bm, buf[j]);
else
return -EFAULT;
}
@@ -2140,13 +2197,9 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
return 0;
}
-/* List of "safe" pages that may be used to store data loaded from the suspend
- * image
- */
-static struct linked_page *safe_pages_list;
-
#ifdef CONFIG_HIGHMEM
-/* struct highmem_pbe is used for creating the list of highmem pages that
+/*
+ * struct highmem_pbe is used for creating the list of highmem pages that
* should be restored atomically during the resume from disk, because the page
* frames they have occupied before the suspend are in use.
*/
@@ -2156,7 +2209,8 @@ struct highmem_pbe {
struct highmem_pbe *next;
};
-/* List of highmem PBEs needed for restoring the highmem pages that were
+/*
+ * List of highmem PBEs needed for restoring the highmem pages that were
* allocated before the suspend and included in the suspend image, but have
* also been allocated by the "resume" kernel, so their contents cannot be
* written directly to their "original" page frames.
@@ -2164,45 +2218,46 @@ struct highmem_pbe {
static struct highmem_pbe *highmem_pblist;
/**
- * count_highmem_image_pages - compute the number of highmem pages in the
- * suspend image. The bits in the memory bitmap @bm that correspond to the
- * image pages are assumed to be set.
+ * count_highmem_image_pages - Compute the number of highmem pages in the image.
+ * @bm: Memory bitmap.
+ *
+ * The bits in @bm that correspond to image pages are assumed to be set.
*/
-
static unsigned int count_highmem_image_pages(struct memory_bitmap *bm)
{
unsigned long pfn;
unsigned int cnt = 0;
memory_bm_position_reset(bm);
- pfn = memory_bm_next_pfn(bm, 0);
+ pfn = memory_bm_next_pfn(bm);
while (pfn != BM_END_OF_MAP) {
if (PageHighMem(pfn_to_page(pfn)))
cnt++;
- pfn = memory_bm_next_pfn(bm, 0);
+ pfn = memory_bm_next_pfn(bm);
}
return cnt;
}
-/**
- * prepare_highmem_image - try to allocate as many highmem pages as
- * there are highmem image pages (@nr_highmem_p points to the variable
- * containing the number of highmem image pages). The pages that are
- * "safe" (ie. will not be overwritten when the suspend image is
- * restored) have the corresponding bits set in @bm (it must be
- * unitialized).
- *
- * NOTE: This function should not be called if there are no highmem
- * image pages.
- */
-
static unsigned int safe_highmem_pages;
static struct memory_bitmap *safe_highmem_bm;
-static int
-prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
+/**
+ * prepare_highmem_image - Allocate memory for loading highmem data from image.
+ * @bm: Pointer to an uninitialized memory bitmap structure.
+ * @nr_highmem_p: Pointer to the number of highmem image pages.
+ *
+ * Try to allocate as many highmem pages as there are highmem image pages
+ * (@nr_highmem_p points to the variable containing the number of highmem image
+ * pages). The pages that are "safe" (i.e. will not be overwritten when the
+ * hibernation image is restored entirely) have the corresponding bits set in
+ * @bm (it must be uninitialized).
+ *
+ * NOTE: This function should not be called if there are no highmem image pages.
+ */
+static int prepare_highmem_image(struct memory_bitmap *bm,
+ unsigned int *nr_highmem_p)
{
unsigned int to_alloc;
@@ -2225,7 +2280,7 @@ prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
page = alloc_page(__GFP_HIGHMEM);
if (!swsusp_page_is_free(page)) {
/* The page is "safe", set its bit the bitmap */
- memory_bm_set_bit(bm, 0, page_to_pfn(page));
+ memory_bm_set_bit(bm, page_to_pfn(page));
safe_highmem_pages++;
}
/* Mark the page as allocated */
@@ -2237,39 +2292,42 @@ prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
return 0;
}
+static struct page *last_highmem_page;
+
/**
- * get_highmem_page_buffer - for given highmem image page find the buffer
- * that suspend_write_next() should set for its caller to write to.
+ * get_highmem_page_buffer - Prepare a buffer to store a highmem image page.
*
- * If the page is to be saved to its "original" page frame or a copy of
- * the page is to be made in the highmem, @buffer is returned. Otherwise,
- * the copy of the page is to be made in normal memory, so the address of
- * the copy is returned.
+ * For a given highmem image page get a buffer that snapshot_write_next()
+ * should return to its caller to write to.
*
- * If @buffer is returned, the caller of suspend_write_next() will write
- * the page's contents to @buffer, so they will have to be copied to the
- * right location on the next call to suspend_write_next() and it is done
- * with the help of copy_last_highmem_page(). For this purpose, if
- * @buffer is returned, @last_highmem page is set to the page to which
- * the data will have to be copied from @buffer.
+ * If the page is to be saved to its "original" page frame or a copy of
+ * the page is to be made in the highmem, @buffer is returned. Otherwise,
+ * the copy of the page is to be made in normal memory, so the address of
+ * the copy is returned.
+ *
+ * If @buffer is returned, the caller of snapshot_write_next() will write
+ * the page's contents to @buffer, so they will have to be copied to the
+ * right location on the next call to snapshot_write_next() and it is done
+ * with the help of copy_last_highmem_page(). For this purpose, if
+ * @buffer is returned, @last_highmem_page is set to the page to which
+ * the data will have to be copied from @buffer.
*/
-
-static struct page *last_highmem_page;
-
-static void *
-get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
+static void *get_highmem_page_buffer(struct page *page,
+ struct chain_allocator *ca)
{
struct highmem_pbe *pbe;
void *kaddr;
if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page)) {
- /* We have allocated the "original" page frame and we can
+ /*
+ * We have allocated the "original" page frame and we can
* use it directly to store the loaded page.
*/
last_highmem_page = page;
return buffer;
}
- /* The "original" page frame has not been allocated and we have to
+ /*
+ * The "original" page frame has not been allocated and we have to
* use a "safe" page frame to store the loaded page.
*/
pbe = chain_alloc(ca, sizeof(struct highmem_pbe));
@@ -2283,7 +2341,7 @@ get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
/* Copy of the page will be stored in high memory */
kaddr = buffer;
- tmp = pfn_to_page(memory_bm_next_pfn(safe_highmem_bm, 0));
+ tmp = pfn_to_page(memory_bm_next_pfn(safe_highmem_bm));
safe_highmem_pages--;
last_highmem_page = tmp;
pbe->copy_page = tmp;
@@ -2299,11 +2357,12 @@ get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
}
/**
- * copy_last_highmem_page - copy the contents of a highmem image from
- * @buffer, where the caller of snapshot_write_next() has place them,
- * to the right location represented by @last_highmem_page .
+ * copy_last_highmem_page - Copy the most recent highmem image page.
+ *
+ * Copy the contents of a highmem image from @buffer, where the caller of
+ * snapshot_write_next() has stored them, to the right location represented by
+ * @last_highmem_page.
*/
-
static void copy_last_highmem_page(void)
{
if (last_highmem_page) {
@@ -2330,17 +2389,13 @@ static inline void free_highmem_data(void)
free_image_page(buffer, PG_UNSAFE_CLEAR);
}
#else
-static unsigned int
-count_highmem_image_pages(struct memory_bitmap *bm) { return 0; }
+static unsigned int count_highmem_image_pages(struct memory_bitmap *bm) { return 0; }
-static inline int
-prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
-{
- return 0;
-}
+static inline int prepare_highmem_image(struct memory_bitmap *bm,
+ unsigned int *nr_highmem_p) { return 0; }
-static inline void *
-get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
+static inline void *get_highmem_page_buffer(struct page *page,
+ struct chain_allocator *ca)
{
return ERR_PTR(-EINVAL);
}
@@ -2350,27 +2405,27 @@ static inline int last_highmem_page_copied(void) { return 1; }
static inline void free_highmem_data(void) {}
#endif /* CONFIG_HIGHMEM */
+#define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe))
+
/**
- * prepare_image - use the memory bitmap @bm to mark the pages that will
- * be overwritten in the process of restoring the system memory state
- * from the suspend image ("unsafe" pages) and allocate memory for the
- * image.
+ * prepare_image - Make room for loading hibernation image.
+ * @new_bm: Uninitialized memory bitmap structure.
+ * @bm: Memory bitmap with unsafe pages marked.
+ *
+ * Use @bm to mark the pages that will be overwritten in the process of
+ * restoring the system memory state from the suspend image ("unsafe" pages)
+ * and allocate memory for the image.
*
- * The idea is to allocate a new memory bitmap first and then allocate
- * as many pages as needed for the image data, but not to assign these
- * pages to specific tasks initially. Instead, we just mark them as
- * allocated and create a lists of "safe" pages that will be used
- * later. On systems with high memory a list of "safe" highmem pages is
- * also created.
+ * The idea is to allocate a new memory bitmap first and then allocate
+ * as many pages as needed for image data, but without specifying what those
+ * pages will be used for just yet. Instead, we mark them all as allocated and
+ * create a list of "safe" pages to be used later. On systems with high
+ * memory a list of "safe" highmem pages is created too.
*/
-
-#define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe))
-
-static int
-prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
+static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
{
unsigned int nr_pages, nr_highmem;
- struct linked_page *sp_list, *lp;
+ struct linked_page *lp;
int error;
/* If there is no highmem, the buffer will not be necessary */
@@ -2378,9 +2433,7 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
buffer = NULL;
nr_highmem = count_highmem_image_pages(bm);
- error = mark_unsafe_pages(bm);
- if (error)
- goto Free;
+ mark_unsafe_pages(bm);
error = memory_bm_create(new_bm, GFP_ATOMIC, PG_SAFE);
if (error)
@@ -2393,14 +2446,15 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
if (error)
goto Free;
}
- /* Reserve some safe pages for potential later use.
+ /*
+ * Reserve some safe pages for potential later use.
*
* NOTE: This way we make sure there will be enough safe pages for the
* chain_alloc() in get_buffer(). It is a bit wasteful, but
* nr_copy_pages cannot be greater than 50% of the memory anyway.
+ *
+ * nr_copy_pages cannot be less than allocated_unsafe_pages either.
*/
- sp_list = NULL;
- /* nr_copy_pages cannot be lesser than allocated_unsafe_pages */
nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages;
nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE);
while (nr_pages > 0) {
@@ -2409,12 +2463,11 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
error = -ENOMEM;
goto Free;
}
- lp->next = sp_list;
- sp_list = lp;
+ lp->next = safe_pages_list;
+ safe_pages_list = lp;
nr_pages--;
}
/* Preallocate memory for the image */
- safe_pages_list = NULL;
nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages;
while (nr_pages > 0) {
lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC);
@@ -2432,12 +2485,6 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
swsusp_set_page_free(virt_to_page(lp));
nr_pages--;
}
- /* Free the reserved safe pages so that chain_alloc() can use them */
- while (sp_list) {
- lp = sp_list->next;
- free_image_page(sp_list, PG_UNSAFE_CLEAR);
- sp_list = lp;
- }
return 0;
Free:
@@ -2446,15 +2493,16 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
}
/**
- * get_buffer - compute the address that snapshot_write_next() should
- * set for its caller to write to.
+ * get_buffer - Get the address to store the next image data page.
+ *
+ * Get the address that snapshot_write_next() should return to its caller to
+ * write to.
*/
-
static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
{
struct pbe *pbe;
struct page *page;
- unsigned long pfn = memory_bm_next_pfn(bm, 0);
+ unsigned long pfn = memory_bm_next_pfn(bm);
if (pfn == BM_END_OF_MAP)
return ERR_PTR(-EFAULT);
@@ -2464,12 +2512,14 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
return get_highmem_page_buffer(page, ca);
if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page))
- /* We have allocated the "original" page frame and we can
+ /*
+ * We have allocated the "original" page frame and we can
* use it directly to store the loaded page.
*/
return page_address(page);
- /* The "original" page frame has not been allocated and we have to
+ /*
+ * The "original" page frame has not been allocated and we have to
* use a "safe" page frame to store the loaded page.
*/
pbe = chain_alloc(ca, sizeof(struct pbe));
@@ -2486,22 +2536,21 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
}
/**
- * snapshot_write_next - used for writing the system memory snapshot.
+ * snapshot_write_next - Get the address to store the next image page.
+ * @handle: Snapshot handle structure to guide the writing.
*
- * On the first call to it @handle should point to a zeroed
- * snapshot_handle structure. The structure gets updated and a pointer
- * to it should be passed to this function every next time.
+ * On the first call, @handle should point to a zeroed snapshot_handle
+ * structure. The structure gets populated then and a pointer to it should be
+ * passed to this function every next time.
*
- * On success the function returns a positive number. Then, the caller
- * is allowed to write up to the returned number of bytes to the memory
- * location computed by the data_of() macro.
+ * On success, the function returns a positive number. Then, the caller
+ * is allowed to write up to the returned number of bytes to the memory
+ * location computed by the data_of() macro.
*
- * The function returns 0 to indicate the "end of file" condition,
- * and a negative number is returned on error. In such cases the
- * structure pointed to by @handle is not updated and should not be used
- * any more.
+ * The function returns 0 to indicate the "end of file" condition. Negative
+ * numbers are returned on errors, in which cases the structure pointed to by
+ * @handle is not updated and should not be used any more.
*/
-
int snapshot_write_next(struct snapshot_handle *handle)
{
static struct chain_allocator ca;
@@ -2527,6 +2576,8 @@ int snapshot_write_next(struct snapshot_handle *handle)
if (error)
return error;
+ safe_pages_list = NULL;
+
error = memory_bm_create(&copy_bm, GFP_ATOMIC, PG_ANY);
if (error)
return error;
@@ -2536,6 +2587,7 @@ int snapshot_write_next(struct snapshot_handle *handle)
if (error)
return error;
+ hibernate_restore_protection_begin();
} else if (handle->cur <= nr_meta_pages + 1) {
error = unpack_orig_pfns(buffer, &copy_bm);
if (error)
@@ -2558,6 +2610,7 @@ int snapshot_write_next(struct snapshot_handle *handle)
copy_last_highmem_page();
/* Restore page key for data page (s390 only). */
page_key_write(handle->buffer);
+ hibernate_restore_protect_page(handle->buffer);
handle->buffer = get_buffer(&orig_bm, &ca);
if (IS_ERR(handle->buffer))
return PTR_ERR(handle->buffer);
@@ -2569,22 +2622,23 @@ int snapshot_write_next(struct snapshot_handle *handle)
}
/**
- * snapshot_write_finalize - must be called after the last call to
- * snapshot_write_next() in case the last page in the image happens
- * to be a highmem page and its contents should be stored in the
- * highmem. Additionally, it releases the memory that will not be
- * used any more.
+ * snapshot_write_finalize - Complete the loading of a hibernation image.
+ *
+ * Must be called after the last call to snapshot_write_next() in case the last
+ * page in the image happens to be a highmem page and its contents should be
+ * stored in highmem. Additionally, it recycles bitmap memory that's not
+ * necessary any more.
*/
-
void snapshot_write_finalize(struct snapshot_handle *handle)
{
copy_last_highmem_page();
/* Restore page key for data page (s390 only). */
page_key_write(handle->buffer);
page_key_free();
- /* Free only if we have loaded the image entirely */
+ hibernate_restore_protect_page(handle->buffer);
+ /* Do that only if we have loaded the image entirely */
if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) {
- memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR);
+ memory_bm_recycle(&orig_bm);
free_highmem_data();
}
}
@@ -2597,8 +2651,8 @@ int snapshot_image_loaded(struct snapshot_handle *handle)
#ifdef CONFIG_HIGHMEM
/* Assumes that @buf is ready and points to a "safe" page */
-static inline void
-swap_two_pages_data(struct page *p1, struct page *p2, void *buf)
+static inline void swap_two_pages_data(struct page *p1, struct page *p2,
+ void *buf)
{
void *kaddr1, *kaddr2;
@@ -2612,15 +2666,15 @@ swap_two_pages_data(struct page *p1, struct page *p2, void *buf)
}
/**
- * restore_highmem - for each highmem page that was allocated before
- * the suspend and included in the suspend image, and also has been
- * allocated by the "resume" kernel swap its current (ie. "before
- * resume") contents with the previous (ie. "before suspend") one.
+ * restore_highmem - Put highmem image pages into their original locations.
+ *
+ * For each highmem page that was in use before hibernation and is included in
+ * the image, and also has been allocated by the "restore" kernel, swap its
+ * current contents with the previous (ie. "before hibernation") ones.
*
- * If the resume eventually fails, we can call this function once
- * again and restore the "before resume" highmem state.
+ * If the restore eventually fails, we can call this function once again and
+ * restore the highmem state as seen by the restore kernel.
*/
-
int restore_highmem(void)
{
struct highmem_pbe *pbe = highmem_pblist;
@@ -2641,82 +2695,3 @@ int restore_highmem(void)
return 0;
}
#endif /* CONFIG_HIGHMEM */
-
-struct memory_bitmap *pageset1_map, *pageset2_map, *free_map, *nosave_map,
- *pageset1_copy_map, *io_map, *page_resave_map, *compare_map;
-
-int resume_attempted;
-
-int memory_bm_write(struct memory_bitmap *bm, int (*rw_chunk)
- (int rw, struct toi_module_ops *owner, char *buffer, int buffer_size))
-{
- int result;
-
- memory_bm_position_reset(bm);
-
- do {
- result = rw_chunk(WRITE, NULL, (char *) bm->cur[0].node->data, PAGE_SIZE);
-
- if (result)
- return result;
- } while (rtree_next_node(bm, 0));
- return 0;
-}
-
-int memory_bm_read(struct memory_bitmap *bm, int (*rw_chunk)
- (int rw, struct toi_module_ops *owner, char *buffer, int buffer_size))
-{
- int result;
-
- memory_bm_position_reset(bm);
-
- do {
- result = rw_chunk(READ, NULL, (char *) bm->cur[0].node->data, PAGE_SIZE);
-
- if (result)
- return result;
-
- } while (rtree_next_node(bm, 0));
- return 0;
-}
-
-int memory_bm_space_needed(struct memory_bitmap *bm)
-{
- unsigned long bytes = 0;
-
- memory_bm_position_reset(bm);
- do {
- bytes += PAGE_SIZE;
- } while (rtree_next_node(bm, 0));
- return bytes;
-}
-
-int toi_alloc_bitmap(struct memory_bitmap **bm)
-{
- int error;
- struct memory_bitmap *bm1;
-
- bm1 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL);
- if (!bm1)
- return -ENOMEM;
-
- error = memory_bm_create(bm1, GFP_KERNEL, PG_ANY);
- if (error) {
- printk("Error returned - %d.\n", error);
- kfree(bm1);
- return -ENOMEM;
- }
-
- *bm = bm1;
- return 0;
-}
-
-void toi_free_bitmap(struct memory_bitmap **bm)
-{
- if (!*bm)
- return;
-
- memory_bm_free(*bm, 0);
- kfree(*bm);
- *bm = NULL;
-}
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 5b70d64b8..0acab9d7f 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -266,16 +266,18 @@ static int suspend_test(int level)
*/
static int suspend_prepare(suspend_state_t state)
{
- int error;
+ int error, nr_calls = 0;
if (!sleep_state_supported(state))
return -EPERM;
pm_prepare_console();
- error = pm_notifier_call_chain(PM_SUSPEND_PREPARE);
- if (error)
+ error = __pm_notifier_call_chain(PM_SUSPEND_PREPARE, -1, &nr_calls);
+ if (error) {
+ nr_calls--;
goto Finish;
+ }
trace_suspend_resume(TPS("freeze_processes"), 0, true);
error = suspend_freeze_processes();
@@ -286,7 +288,7 @@ static int suspend_prepare(suspend_state_t state)
suspend_stats.failed_freeze++;
dpm_save_failed_step(SUSPEND_FREEZE);
Finish:
- pm_notifier_call_chain(PM_POST_SUSPEND);
+ __pm_notifier_call_chain(PM_POST_SUSPEND, nr_calls, NULL);
pm_restore_console();
return error;
}
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 160e10066..a3b1e617b 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -261,7 +261,7 @@ static void hib_end_io(struct bio *bio)
bio_put(bio);
}
-static int hib_submit_io(int rw, pgoff_t page_off, void *addr,
+static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr,
struct hib_bio_batch *hb)
{
struct page *page = virt_to_page(addr);
@@ -271,6 +271,7 @@ static int hib_submit_io(int rw, pgoff_t page_off, void *addr,
bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1);
bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9);
bio->bi_bdev = hib_resume_bdev;
+ bio_set_op_attrs(bio, op, op_flags);
if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
printk(KERN_ERR "PM: Adding page to bio failed at %llu\n",
@@ -283,9 +284,9 @@ static int hib_submit_io(int rw, pgoff_t page_off, void *addr,
bio->bi_end_io = hib_end_io;
bio->bi_private = hb;
atomic_inc(&hb->count);
- submit_bio(rw, bio);
+ submit_bio(bio);
} else {
- error = submit_bio_wait(rw, bio);
+ error = submit_bio_wait(bio);
bio_put(bio);
}
@@ -306,7 +307,8 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
{
int error;
- hib_submit_io(READ_SYNC, swsusp_resume_block, swsusp_header, NULL);
+ hib_submit_io(REQ_OP_READ, READ_SYNC, swsusp_resume_block,
+ swsusp_header, NULL);
if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) ||
!memcmp("SWAPSPACE2",swsusp_header->sig, 10)) {
memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10);
@@ -315,8 +317,8 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
swsusp_header->flags = flags;
if (flags & SF_CRC32_MODE)
swsusp_header->crc32 = handle->crc32;
- error = hib_submit_io(WRITE_SYNC, swsusp_resume_block,
- swsusp_header, NULL);
+ error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC,
+ swsusp_resume_block, swsusp_header, NULL);
} else {
printk(KERN_ERR "PM: Swap header not found!\n");
error = -ENODEV;
@@ -348,6 +350,12 @@ static int swsusp_swap_check(void)
if (res < 0)
blkdev_put(hib_resume_bdev, FMODE_WRITE);
+ /*
+ * Update the resume device to the one actually used,
+ * so the test_resume mode can use it in case it is
+ * invoked from hibernate() to test the snapshot.
+ */
+ swsusp_resume_device = hib_resume_bdev->bd_dev;
return res;
}
@@ -389,7 +397,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb)
} else {
src = buf;
}
- return hib_submit_io(WRITE_SYNC, offset, src, hb);
+ return hib_submit_io(REQ_OP_WRITE, WRITE_SYNC, offset, src, hb);
}
static void release_swap_writer(struct swap_map_handle *handle)
@@ -992,7 +1000,8 @@ static int get_swap_reader(struct swap_map_handle *handle,
return -ENOMEM;
}
- error = hib_submit_io(READ_SYNC, offset, tmp->map, NULL);
+ error = hib_submit_io(REQ_OP_READ, READ_SYNC, offset,
+ tmp->map, NULL);
if (error) {
release_swap_reader(handle);
return error;
@@ -1016,7 +1025,7 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf,
offset = handle->cur->entries[handle->k];
if (!offset)
return -EFAULT;
- error = hib_submit_io(READ_SYNC, offset, buf, hb);
+ error = hib_submit_io(REQ_OP_READ, READ_SYNC, offset, buf, hb);
if (error)
return error;
if (++handle->k >= MAP_PAGE_ENTRIES) {
@@ -1525,7 +1534,8 @@ int swsusp_check(void)
if (!IS_ERR(hib_resume_bdev)) {
set_blocksize(hib_resume_bdev, PAGE_SIZE);
clear_page(swsusp_header);
- error = hib_submit_io(READ_SYNC, swsusp_resume_block,
+ error = hib_submit_io(REQ_OP_READ, READ_SYNC,
+ swsusp_resume_block,
swsusp_header, NULL);
if (error)
goto put;
@@ -1533,7 +1543,8 @@ int swsusp_check(void)
if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) {
memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
/* Reset swap signature now */
- error = hib_submit_io(WRITE_SYNC, swsusp_resume_block,
+ error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC,
+ swsusp_resume_block,
swsusp_header, NULL);
} else {
error = -EINVAL;
@@ -1577,10 +1588,12 @@ int swsusp_unmark(void)
{
int error;
- hib_submit_io(READ_SYNC, swsusp_resume_block, swsusp_header, NULL);
+ hib_submit_io(REQ_OP_READ, READ_SYNC, swsusp_resume_block,
+ swsusp_header, NULL);
if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) {
memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10);
- error = hib_submit_io(WRITE_SYNC, swsusp_resume_block,
+ error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC,
+ swsusp_resume_block,
swsusp_header, NULL);
} else {
printk(KERN_ERR "PM: Cannot find swsusp signature!\n");
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 526e89114..35310b627 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -47,7 +47,7 @@ atomic_t snapshot_device_available = ATOMIC_INIT(1);
static int snapshot_open(struct inode *inode, struct file *filp)
{
struct snapshot_data *data;
- int error;
+ int error, nr_calls = 0;
if (!hibernation_available())
return -EPERM;
@@ -74,9 +74,9 @@ static int snapshot_open(struct inode *inode, struct file *filp)
swap_type_of(swsusp_resume_device, 0, NULL) : -1;
data->mode = O_RDONLY;
data->free_bitmaps = false;
- error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
+ error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls);
if (error)
- pm_notifier_call_chain(PM_POST_HIBERNATION);
+ __pm_notifier_call_chain(PM_POST_HIBERNATION, --nr_calls, NULL);
} else {
/*
* Resuming. We may need to wait for the image device to
@@ -86,13 +86,15 @@ static int snapshot_open(struct inode *inode, struct file *filp)
data->swap = -1;
data->mode = O_WRONLY;
- error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
+ error = __pm_notifier_call_chain(PM_RESTORE_PREPARE, -1, &nr_calls);
if (!error) {
error = create_basic_memory_bitmaps();
data->free_bitmaps = !error;
- }
+ } else
+ nr_calls--;
+
if (error)
- pm_notifier_call_chain(PM_POST_RESTORE);
+ __pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL);
}
if (error)
atomic_inc(&snapshot_device_available);
diff --git a/kernel/printk/braille.c b/kernel/printk/braille.c
index 276762f3a..d5760c42f 100644
--- a/kernel/printk/braille.c
+++ b/kernel/printk/braille.c
@@ -9,10 +9,10 @@
char *_braille_console_setup(char **str, char **brl_options)
{
- if (!memcmp(*str, "brl,", 4)) {
+ if (!strncmp(*str, "brl,", 4)) {
*brl_options = "";
*str += 4;
- } else if (!memcmp(str, "brl=", 4)) {
+ } else if (!strncmp(*str, "brl=", 4)) {
*brl_options = *str + 4;
*str = strchr(*brl_options, ',');
if (!*str)
diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c
index b69eb8a28..16bab471c 100644
--- a/kernel/printk/nmi.c
+++ b/kernel/printk/nmi.c
@@ -99,27 +99,33 @@ again:
return add;
}
-/*
- * printk one line from the temporary buffer from @start index until
- * and including the @end index.
- */
-static void print_nmi_seq_line(struct nmi_seq_buf *s, int start, int end)
+static void printk_nmi_flush_line(const char *text, int len)
{
- const char *buf = s->buffer + start;
-
/*
* The buffers are flushed in NMI only on panic. The messages must
* go only into the ring buffer at this stage. Consoles will get
* explicitly called later when a crashdump is not generated.
*/
if (in_nmi())
- printk_deferred("%.*s", (end - start) + 1, buf);
+ printk_deferred("%.*s", len, text);
else
- printk("%.*s", (end - start) + 1, buf);
+ printk("%.*s", len, text);
}
/*
+ * printk one line from the temporary buffer from @start index until
+ * and including the @end index.
+ */
+static void printk_nmi_flush_seq_line(struct nmi_seq_buf *s,
+ int start, int end)
+{
+ const char *buf = s->buffer + start;
+
+ printk_nmi_flush_line(buf, (end - start) + 1);
+}
+
+/*
* Flush data from the associated per_CPU buffer. The function
* can be called either via IRQ work or independently.
*/
@@ -150,9 +156,11 @@ more:
* the buffer an unexpected way. If we printed something then
* @len must only increase.
*/
- if (i && i >= len)
- pr_err("printk_nmi_flush: internal error: i=%d >= len=%zu\n",
- i, len);
+ if (i && i >= len) {
+ const char *msg = "printk_nmi_flush: internal error\n";
+
+ printk_nmi_flush_line(msg, strlen(msg));
+ }
if (!len)
goto out; /* Someone else has already flushed the buffer. */
@@ -166,14 +174,14 @@ more:
/* Print line by line. */
for (; i < size; i++) {
if (s->buffer[i] == '\n') {
- print_nmi_seq_line(s, last_i, i);
+ printk_nmi_flush_seq_line(s, last_i, i);
last_i = i + 1;
}
}
/* Check if there was a partial line. */
if (last_i < size) {
- print_nmi_seq_line(s, last_i, size - 1);
- pr_cont("\n");
+ printk_nmi_flush_seq_line(s, last_i, size - 1);
+ printk_nmi_flush_line("\n", strlen("\n"));
}
/*
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 3e8d47ee9..eea6dbc2d 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -26,14 +26,12 @@
#include <linux/nmi.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
-#include <linux/interrupt.h> /* For in_interrupt() */
#include <linux/delay.h>
#include <linux/smp.h>
#include <linux/security.h>
#include <linux/bootmem.h>
#include <linux/memblock.h>
#include <linux/syscalls.h>
-#include <linux/suspend.h>
#include <linux/kexec.h>
#include <linux/kdb.h>
#include <linux/ratelimit.h>
@@ -49,7 +47,7 @@
#include <linux/uio.h>
#include <asm/uaccess.h>
-#include <asm-generic/sections.h>
+#include <asm/sections.h>
#define CREATE_TRACE_POINTS
#include <trace/events/printk.h>
@@ -87,6 +85,111 @@ static struct lockdep_map console_lock_dep_map = {
};
#endif
+enum devkmsg_log_bits {
+ __DEVKMSG_LOG_BIT_ON = 0,
+ __DEVKMSG_LOG_BIT_OFF,
+ __DEVKMSG_LOG_BIT_LOCK,
+};
+
+enum devkmsg_log_masks {
+ DEVKMSG_LOG_MASK_ON = BIT(__DEVKMSG_LOG_BIT_ON),
+ DEVKMSG_LOG_MASK_OFF = BIT(__DEVKMSG_LOG_BIT_OFF),
+ DEVKMSG_LOG_MASK_LOCK = BIT(__DEVKMSG_LOG_BIT_LOCK),
+};
+
+/* Keep both the 'on' and 'off' bits clear, i.e. ratelimit by default: */
+#define DEVKMSG_LOG_MASK_DEFAULT 0
+
+static unsigned int __read_mostly devkmsg_log = DEVKMSG_LOG_MASK_DEFAULT;
+
+static int __control_devkmsg(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!strncmp(str, "on", 2)) {
+ devkmsg_log = DEVKMSG_LOG_MASK_ON;
+ return 2;
+ } else if (!strncmp(str, "off", 3)) {
+ devkmsg_log = DEVKMSG_LOG_MASK_OFF;
+ return 3;
+ } else if (!strncmp(str, "ratelimit", 9)) {
+ devkmsg_log = DEVKMSG_LOG_MASK_DEFAULT;
+ return 9;
+ }
+ return -EINVAL;
+}
+
+static int __init control_devkmsg(char *str)
+{
+ if (__control_devkmsg(str) < 0)
+ return 1;
+
+ /*
+ * Set sysctl string accordingly:
+ */
+ if (devkmsg_log == DEVKMSG_LOG_MASK_ON) {
+ memset(devkmsg_log_str, 0, DEVKMSG_STR_MAX_SIZE);
+ strncpy(devkmsg_log_str, "on", 2);
+ } else if (devkmsg_log == DEVKMSG_LOG_MASK_OFF) {
+ memset(devkmsg_log_str, 0, DEVKMSG_STR_MAX_SIZE);
+ strncpy(devkmsg_log_str, "off", 3);
+ }
+ /* else "ratelimit" which is set by default. */
+
+ /*
+ * Sysctl cannot change it anymore. The kernel command line setting of
+ * this parameter is to force the setting to be permanent throughout the
+ * runtime of the system. This is a precaution measure against userspace
+ * trying to be a smarta** and attempting to change it up on us.
+ */
+ devkmsg_log |= DEVKMSG_LOG_MASK_LOCK;
+
+ return 0;
+}
+__setup("printk.devkmsg=", control_devkmsg);
+
+char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE] = "ratelimit";
+
+int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ char old_str[DEVKMSG_STR_MAX_SIZE];
+ unsigned int old;
+ int err;
+
+ if (write) {
+ if (devkmsg_log & DEVKMSG_LOG_MASK_LOCK)
+ return -EINVAL;
+
+ old = devkmsg_log;
+ strncpy(old_str, devkmsg_log_str, DEVKMSG_STR_MAX_SIZE);
+ }
+
+ err = proc_dostring(table, write, buffer, lenp, ppos);
+ if (err)
+ return err;
+
+ if (write) {
+ err = __control_devkmsg(devkmsg_log_str);
+
+ /*
+ * Do not accept an unknown string OR a known string with
+ * trailing crap...
+ */
+ if (err < 0 || (err + 1 != *lenp)) {
+
+ /* ... and restore old setting. */
+ devkmsg_log = old;
+ strncpy(devkmsg_log_str, old_str, DEVKMSG_STR_MAX_SIZE);
+
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
/*
* Number of registered extended console drivers.
*
@@ -286,20 +389,6 @@ static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
static char *log_buf = __log_buf;
static u32 log_buf_len = __LOG_BUF_LEN;
-#ifdef CONFIG_TOI_INCREMENTAL
-void toi_set_logbuf_untracked(void)
-{
- int i;
- struct page *log_buf_start_page = virt_to_page(__log_buf);
-
- printk("Not protecting kernel printk log buffer (%p-%p).\n",
- __log_buf, __log_buf + __LOG_BUF_LEN);
-
- for (i = 0; i < (1 << (CONFIG_LOG_BUF_SHIFT - PAGE_SHIFT)); i++)
- SetPageTOI_Untracked(log_buf_start_page + i);
-}
-#endif
-
/* Return log buffer address */
char *log_buf_addr_get(void)
{
@@ -629,6 +718,7 @@ struct devkmsg_user {
u64 seq;
u32 idx;
enum log_flags prev;
+ struct ratelimit_state rs;
struct mutex lock;
char buf[CONSOLE_EXT_LOG_MAX];
};
@@ -638,11 +728,24 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
char *buf, *line;
int level = default_message_loglevel;
int facility = 1; /* LOG_USER */
+ struct file *file = iocb->ki_filp;
+ struct devkmsg_user *user = file->private_data;
size_t len = iov_iter_count(from);
ssize_t ret = len;
- if (len > LOG_LINE_MAX)
+ if (!user || len > LOG_LINE_MAX)
return -EINVAL;
+
+ /* Ignore when user logging is disabled. */
+ if (devkmsg_log & DEVKMSG_LOG_MASK_OFF)
+ return len;
+
+ /* Ratelimit when not explicitly enabled. */
+ if (!(devkmsg_log & DEVKMSG_LOG_MASK_ON)) {
+ if (!___ratelimit(&user->rs, current->comm))
+ return ret;
+ }
+
buf = kmalloc(len+1, GFP_KERNEL);
if (buf == NULL)
return -ENOMEM;
@@ -815,19 +918,24 @@ static int devkmsg_open(struct inode *inode, struct file *file)
struct devkmsg_user *user;
int err;
- /* write-only does not need any file context */
- if ((file->f_flags & O_ACCMODE) == O_WRONLY)
- return 0;
+ if (devkmsg_log & DEVKMSG_LOG_MASK_OFF)
+ return -EPERM;
- err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL,
- SYSLOG_FROM_READER);
- if (err)
- return err;
+ /* write-only does not need any file context */
+ if ((file->f_flags & O_ACCMODE) != O_WRONLY) {
+ err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL,
+ SYSLOG_FROM_READER);
+ if (err)
+ return err;
+ }
user = kmalloc(sizeof(struct devkmsg_user), GFP_KERNEL);
if (!user)
return -ENOMEM;
+ ratelimit_default_init(&user->rs);
+ ratelimit_set_flags(&user->rs, RATELIMIT_MSG_ON_RELEASE);
+
mutex_init(&user->lock);
raw_spin_lock_irq(&logbuf_lock);
@@ -846,6 +954,8 @@ static int devkmsg_release(struct inode *inode, struct file *file)
if (!user)
return 0;
+ ratelimit_state_exit(&user->rs);
+
mutex_destroy(&user->lock);
kfree(user);
return 0;
@@ -1001,6 +1111,11 @@ module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(ignore_loglevel,
"ignore loglevel setting (prints all kernel messages to the console)");
+static bool suppress_message_printing(int level)
+{
+ return (level >= console_loglevel && !ignore_loglevel);
+}
+
#ifdef CONFIG_BOOT_PRINTK_DELAY
static int boot_delay; /* msecs delay after each printk during bootup */
@@ -1030,7 +1145,7 @@ static void boot_delay_msec(int level)
unsigned long timeout;
if ((boot_delay == 0 || system_state != SYSTEM_BOOTING)
- || (level >= console_loglevel && !ignore_loglevel)) {
+ || suppress_message_printing(level)) {
return;
}
@@ -1454,8 +1569,6 @@ static void call_console_drivers(int level,
trace_console(text, len);
- if (level >= console_loglevel && !ignore_loglevel)
- return;
if (!console_drivers)
return;
@@ -1903,6 +2016,7 @@ static void call_console_drivers(int level,
static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev,
bool syslog, char *buf, size_t size) { return 0; }
static size_t cont_print_text(char *text, size_t size) { return 0; }
+static bool suppress_message_printing(int level) { return false; }
/* Still needs to be defined for users */
DEFINE_PER_CPU(printk_func_t, printk_func);
@@ -2182,6 +2296,13 @@ static void console_cont_flush(char *text, size_t size)
if (!cont.len)
goto out;
+ if (suppress_message_printing(cont.level)) {
+ cont.cons = cont.len;
+ if (cont.flushed)
+ cont.len = 0;
+ goto out;
+ }
+
/*
* We still queue earlier records, likely because the console was
* busy. The earlier ones need to be printed before this one, we
@@ -2285,10 +2406,13 @@ skip:
break;
msg = log_from_idx(console_idx);
- if (msg->flags & LOG_NOCONS) {
+ level = msg->level;
+ if ((msg->flags & LOG_NOCONS) ||
+ suppress_message_printing(level)) {
/*
* Skip record we have buffered and already printed
- * directly to the console when we received it.
+ * directly to the console when we received it, and
+ * record that has level above the console loglevel.
*/
console_idx = log_next(console_idx);
console_seq++;
@@ -2302,7 +2426,6 @@ skip:
goto skip;
}
- level = msg->level;
len += msg_print_text(msg, console_prev, false,
text + len, sizeof(text) - len);
if (nr_ext_console_drivers) {
@@ -3192,9 +3315,8 @@ void show_regs_print_info(const char *log_lvl)
{
dump_stack_print_info(log_lvl);
- printk("%stask: %p ti: %p task.ti: %p\n",
- log_lvl, current, current_thread_info(),
- task_thread_info(current));
+ printk("%stask: %p task.stack: %p\n",
+ log_lvl, current, task_stack_page(current));
}
#endif
diff --git a/kernel/profile.c b/kernel/profile.c
index c2199e990..2dbccf2d8 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -328,68 +328,57 @@ out:
put_cpu();
}
-static int profile_cpu_callback(struct notifier_block *info,
- unsigned long action, void *__cpu)
+static int profile_dead_cpu(unsigned int cpu)
{
- int node, cpu = (unsigned long)__cpu;
struct page *page;
+ int i;
- switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- node = cpu_to_mem(cpu);
- per_cpu(cpu_profile_flip, cpu) = 0;
- if (!per_cpu(cpu_profile_hits, cpu)[1]) {
- page = __alloc_pages_node(node,
- GFP_KERNEL | __GFP_ZERO,
- 0);
- if (!page)
- return notifier_from_errno(-ENOMEM);
- per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
- }
- if (!per_cpu(cpu_profile_hits, cpu)[0]) {
- page = __alloc_pages_node(node,
- GFP_KERNEL | __GFP_ZERO,
- 0);
- if (!page)
- goto out_free;
- per_cpu(cpu_profile_hits, cpu)[0] = page_address(page);
- }
- break;
-out_free:
- page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
- per_cpu(cpu_profile_hits, cpu)[1] = NULL;
- __free_page(page);
- return notifier_from_errno(-ENOMEM);
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- if (prof_cpu_mask != NULL)
- cpumask_set_cpu(cpu, prof_cpu_mask);
- break;
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- if (prof_cpu_mask != NULL)
- cpumask_clear_cpu(cpu, prof_cpu_mask);
- if (per_cpu(cpu_profile_hits, cpu)[0]) {
- page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
- per_cpu(cpu_profile_hits, cpu)[0] = NULL;
+ if (prof_cpu_mask != NULL)
+ cpumask_clear_cpu(cpu, prof_cpu_mask);
+
+ for (i = 0; i < 2; i++) {
+ if (per_cpu(cpu_profile_hits, cpu)[i]) {
+ page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[i]);
+ per_cpu(cpu_profile_hits, cpu)[i] = NULL;
__free_page(page);
}
- if (per_cpu(cpu_profile_hits, cpu)[1]) {
- page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
- per_cpu(cpu_profile_hits, cpu)[1] = NULL;
- __free_page(page);
+ }
+ return 0;
+}
+
+static int profile_prepare_cpu(unsigned int cpu)
+{
+ int i, node = cpu_to_mem(cpu);
+ struct page *page;
+
+ per_cpu(cpu_profile_flip, cpu) = 0;
+
+ for (i = 0; i < 2; i++) {
+ if (per_cpu(cpu_profile_hits, cpu)[i])
+ continue;
+
+ page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+ if (!page) {
+ profile_dead_cpu(cpu);
+ return -ENOMEM;
}
- break;
+ per_cpu(cpu_profile_hits, cpu)[i] = page_address(page);
+
}
- return NOTIFY_OK;
+ return 0;
+}
+
+static int profile_online_cpu(unsigned int cpu)
+{
+ if (prof_cpu_mask != NULL)
+ cpumask_set_cpu(cpu, prof_cpu_mask);
+
+ return 0;
}
+
#else /* !CONFIG_SMP */
#define profile_flip_buffers() do { } while (0)
#define profile_discard_flip_buffers() do { } while (0)
-#define profile_cpu_callback NULL
static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)
{
@@ -531,83 +520,43 @@ static const struct file_operations proc_profile_operations = {
.llseek = default_llseek,
};
-#ifdef CONFIG_SMP
-static void profile_nop(void *unused)
-{
-}
-
-static int create_hash_tables(void)
+int __ref create_proc_profile(void)
{
- int cpu;
-
- for_each_online_cpu(cpu) {
- int node = cpu_to_mem(cpu);
- struct page *page;
-
- page = __alloc_pages_node(node,
- GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
- 0);
- if (!page)
- goto out_cleanup;
- per_cpu(cpu_profile_hits, cpu)[1]
- = (struct profile_hit *)page_address(page);
- page = __alloc_pages_node(node,
- GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
- 0);
- if (!page)
- goto out_cleanup;
- per_cpu(cpu_profile_hits, cpu)[0]
- = (struct profile_hit *)page_address(page);
- }
- return 0;
-out_cleanup:
- prof_on = 0;
- smp_mb();
- on_each_cpu(profile_nop, NULL, 1);
- for_each_online_cpu(cpu) {
- struct page *page;
-
- if (per_cpu(cpu_profile_hits, cpu)[0]) {
- page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
- per_cpu(cpu_profile_hits, cpu)[0] = NULL;
- __free_page(page);
- }
- if (per_cpu(cpu_profile_hits, cpu)[1]) {
- page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
- per_cpu(cpu_profile_hits, cpu)[1] = NULL;
- __free_page(page);
- }
- }
- return -1;
-}
-#else
-#define create_hash_tables() ({ 0; })
+ struct proc_dir_entry *entry;
+#ifdef CONFIG_SMP
+ enum cpuhp_state online_state;
#endif
-int __ref create_proc_profile(void) /* false positive from hotcpu_notifier */
-{
- struct proc_dir_entry *entry;
int err = 0;
if (!prof_on)
return 0;
-
- cpu_notifier_register_begin();
-
- if (create_hash_tables()) {
- err = -ENOMEM;
- goto out;
- }
-
+#ifdef CONFIG_SMP
+ err = cpuhp_setup_state(CPUHP_PROFILE_PREPARE, "PROFILE_PREPARE",
+ profile_prepare_cpu, profile_dead_cpu);
+ if (err)
+ return err;
+
+ err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "AP_PROFILE_ONLINE",
+ profile_online_cpu, NULL);
+ if (err < 0)
+ goto err_state_prep;
+ online_state = err;
+ err = 0;
+#endif
entry = proc_create("profile", S_IWUSR | S_IRUGO,
NULL, &proc_profile_operations);
if (!entry)
- goto out;
+ goto err_state_onl;
proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t));
- __hotcpu_notifier(profile_cpu_callback, 0);
-out:
- cpu_notifier_register_done();
+ return err;
+err_state_onl:
+#ifdef CONFIG_SMP
+ cpuhp_remove_state(online_state);
+err_state_prep:
+ cpuhp_remove_state(CPUHP_PROFILE_PREPARE);
+#endif
return err;
}
subsys_initcall(create_proc_profile);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index d49bfa1e5..1d3b7665d 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -585,8 +585,8 @@ static int ptrace_setoptions(struct task_struct *child, unsigned long data)
return -EINVAL;
if (unlikely(data & PTRACE_O_SUSPEND_SECCOMP)) {
- if (!config_enabled(CONFIG_CHECKPOINT_RESTORE) ||
- !config_enabled(CONFIG_SECCOMP))
+ if (!IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) ||
+ !IS_ENABLED(CONFIG_SECCOMP))
return -EINVAL;
if (!capable(CAP_SYS_ADMIN))
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index 3cee0d839..d38ab08a3 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -58,7 +58,7 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>");
#define VERBOSE_PERFOUT_ERRSTRING(s) \
do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0)
-torture_param(bool, gp_exp, true, "Use expedited GP wait primitives");
+torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
torture_param(int, holdoff, 10, "Holdoff time before test start (s)");
torture_param(int, nreaders, -1, "Number of RCU reader threads");
torture_param(int, nwriters, -1, "Number of RCU updater threads");
@@ -96,12 +96,7 @@ static int rcu_perf_writer_state;
#define MAX_MEAS 10000
#define MIN_MEAS 100
-#if defined(MODULE) || defined(CONFIG_RCU_PERF_TEST_RUNNABLE)
-#define RCUPERF_RUNNABLE_INIT 1
-#else
-#define RCUPERF_RUNNABLE_INIT 0
-#endif
-static int perf_runnable = RCUPERF_RUNNABLE_INIT;
+static int perf_runnable = IS_ENABLED(MODULE);
module_param(perf_runnable, int, 0444);
MODULE_PARM_DESC(perf_runnable, "Start rcuperf at boot");
@@ -363,8 +358,6 @@ rcu_perf_writer(void *arg)
u64 *wdpp = writer_durations[me];
VERBOSE_PERFOUT_STRING("rcu_perf_writer task started");
- WARN_ON(rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp);
- WARN_ON(rcu_gp_is_normal() && gp_exp);
WARN_ON(!wdpp);
set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
sp.sched_priority = 1;
@@ -631,12 +624,24 @@ rcu_perf_init(void)
firsterr = -ENOMEM;
goto unwind;
}
+ if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) {
+ VERBOSE_PERFOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!");
+ firsterr = -EINVAL;
+ goto unwind;
+ }
+ if (rcu_gp_is_normal() && gp_exp) {
+ VERBOSE_PERFOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!");
+ firsterr = -EINVAL;
+ goto unwind;
+ }
for (i = 0; i < nrealwriters; i++) {
writer_durations[i] =
kcalloc(MAX_MEAS, sizeof(*writer_durations[i]),
GFP_KERNEL);
- if (!writer_durations[i])
+ if (!writer_durations[i]) {
+ firsterr = -ENOMEM;
goto unwind;
+ }
firsterr = torture_create_kthread(rcu_perf_writer, (void *)i,
writer_tasks[i]);
if (firsterr)
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 084a28a73..971e2b138 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -182,12 +182,7 @@ static const char *rcu_torture_writer_state_getname(void)
return rcu_torture_writer_state_names[i];
}
-#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE)
-#define RCUTORTURE_RUNNABLE_INIT 1
-#else
-#define RCUTORTURE_RUNNABLE_INIT 0
-#endif
-static int torture_runnable = RCUTORTURE_RUNNABLE_INIT;
+static int torture_runnable = IS_ENABLED(MODULE);
module_param(torture_runnable, int, 0444);
MODULE_PARM_DESC(torture_runnable, "Start rcutorture at boot");
@@ -1476,7 +1471,7 @@ static int rcu_torture_barrier_cbs(void *arg)
break;
/*
* The above smp_load_acquire() ensures barrier_phase load
- * is ordered before the folloiwng ->call().
+ * is ordered before the following ->call().
*/
local_irq_disable(); /* Just to test no-irq call_rcu(). */
cur_ops->call(&rcu, rcu_torture_barrier_cbf);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index c7f1bc4f8..5d80925e7 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -125,12 +125,14 @@ int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
/* Number of rcu_nodes at specified level. */
static int num_rcu_lvl[] = NUM_RCU_LVL_INIT;
int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
+/* panic() on RCU Stall sysctl. */
+int sysctl_panic_on_rcu_stall __read_mostly;
/*
* The rcu_scheduler_active variable transitions from zero to one just
* before the first task is spawned. So when this variable is zero, RCU
* can assume that there is but one task, allowing RCU to (for example)
- * optimize synchronize_sched() to a simple barrier(). When this variable
+ * optimize synchronize_rcu() to a simple barrier(). When this variable
* is one, RCU must actually do all the hard work required to detect real
* grace periods. This variable is also used to suppress boot-time false
* positives from lockdep-RCU error checking.
@@ -159,6 +161,7 @@ static void invoke_rcu_core(void);
static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
static void rcu_report_exp_rdp(struct rcu_state *rsp,
struct rcu_data *rdp, bool wake);
+static void sync_sched_exp_online_cleanup(int cpu);
/* rcuc/rcub kthread realtime priority */
#ifdef CONFIG_RCU_KTHREAD_PRIO
@@ -1070,11 +1073,11 @@ EXPORT_SYMBOL_GPL(rcu_is_watching);
* offline to continue to use RCU for one jiffy after marking itself
* offline in the cpu_online_mask. This leniency is necessary given the
* non-atomic nature of the online and offline processing, for example,
- * the fact that a CPU enters the scheduler after completing the CPU_DYING
- * notifiers.
+ * the fact that a CPU enters the scheduler after completing the teardown
+ * of the CPU.
*
- * This is also why RCU internally marks CPUs online during the
- * CPU_UP_PREPARE phase and offline during the CPU_DEAD phase.
+ * This is also why RCU internally marks CPUs online during the
+ * preparation phase and offline after the CPU has been taken down.
*
* Disable checking if in an NMI handler because we cannot safely report
* errors from NMI handlers anyway.
@@ -1284,9 +1287,9 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
rcu_for_each_leaf_node(rsp, rnp) {
raw_spin_lock_irqsave_rcu_node(rnp, flags);
if (rnp->qsmask != 0) {
- for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
- if (rnp->qsmask & (1UL << cpu))
- dump_cpu_task(rnp->grplo + cpu);
+ for_each_leaf_node_possible_cpu(rnp, cpu)
+ if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu))
+ dump_cpu_task(cpu);
}
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
@@ -1311,6 +1314,12 @@ static void rcu_stall_kick_kthreads(struct rcu_state *rsp)
}
}
+static inline void panic_on_rcu_stall(void)
+{
+ if (sysctl_panic_on_rcu_stall)
+ panic("RCU Stall\n");
+}
+
static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
{
int cpu;
@@ -1351,10 +1360,9 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
raw_spin_lock_irqsave_rcu_node(rnp, flags);
ndetected += rcu_print_task_stall(rnp);
if (rnp->qsmask != 0) {
- for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
- if (rnp->qsmask & (1UL << cpu)) {
- print_cpu_stall_info(rsp,
- rnp->grplo + cpu);
+ for_each_leaf_node_possible_cpu(rnp, cpu)
+ if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) {
+ print_cpu_stall_info(rsp, cpu);
ndetected++;
}
}
@@ -1390,6 +1398,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
rcu_check_gp_kthread_starvation(rsp);
+ panic_on_rcu_stall();
+
force_quiescent_state(rsp); /* Kick them all. */
}
@@ -1430,6 +1440,8 @@ static void print_cpu_stall(struct rcu_state *rsp)
jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ panic_on_rcu_stall();
+
/*
* Attempt to revive the RCU machinery by forcing a context switch.
*
@@ -1989,8 +2001,7 @@ static bool rcu_gp_init(struct rcu_state *rsp)
* of the tree within the rsp->node[] array. Note that other CPUs
* will access only the leaves of the hierarchy, thus seeing that no
* grace period is in progress, at least until the corresponding
- * leaf node has been initialized. In addition, we have excluded
- * CPU-hotplug operations.
+ * leaf node has been initialized.
*
* The grace period cannot complete until the initialization
* process finishes, because this kthread handles both.
@@ -2872,7 +2883,6 @@ static void force_qs_rnp(struct rcu_state *rsp,
unsigned long *maxj),
bool *isidle, unsigned long *maxj)
{
- unsigned long bit;
int cpu;
unsigned long flags;
unsigned long mask;
@@ -2907,9 +2917,8 @@ static void force_qs_rnp(struct rcu_state *rsp,
continue;
}
}
- cpu = rnp->grplo;
- bit = 1;
- for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
+ for_each_leaf_node_possible_cpu(rnp, cpu) {
+ unsigned long bit = leaf_node_cpu_bit(rnp, cpu);
if ((rnp->qsmask & bit) != 0) {
if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj))
mask |= bit;
@@ -3448,549 +3457,6 @@ static bool rcu_seq_done(unsigned long *sp, unsigned long s)
return ULONG_CMP_GE(READ_ONCE(*sp), s);
}
-/* Wrapper functions for expedited grace periods. */
-static void rcu_exp_gp_seq_start(struct rcu_state *rsp)
-{
- rcu_seq_start(&rsp->expedited_sequence);
-}
-static void rcu_exp_gp_seq_end(struct rcu_state *rsp)
-{
- rcu_seq_end(&rsp->expedited_sequence);
- smp_mb(); /* Ensure that consecutive grace periods serialize. */
-}
-static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp)
-{
- unsigned long s;
-
- smp_mb(); /* Caller's modifications seen first by other CPUs. */
- s = rcu_seq_snap(&rsp->expedited_sequence);
- trace_rcu_exp_grace_period(rsp->name, s, TPS("snap"));
- return s;
-}
-static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s)
-{
- return rcu_seq_done(&rsp->expedited_sequence, s);
-}
-
-/*
- * Reset the ->expmaskinit values in the rcu_node tree to reflect any
- * recent CPU-online activity. Note that these masks are not cleared
- * when CPUs go offline, so they reflect the union of all CPUs that have
- * ever been online. This means that this function normally takes its
- * no-work-to-do fastpath.
- */
-static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp)
-{
- bool done;
- unsigned long flags;
- unsigned long mask;
- unsigned long oldmask;
- int ncpus = READ_ONCE(rsp->ncpus);
- struct rcu_node *rnp;
- struct rcu_node *rnp_up;
-
- /* If no new CPUs onlined since last time, nothing to do. */
- if (likely(ncpus == rsp->ncpus_snap))
- return;
- rsp->ncpus_snap = ncpus;
-
- /*
- * Each pass through the following loop propagates newly onlined
- * CPUs for the current rcu_node structure up the rcu_node tree.
- */
- rcu_for_each_leaf_node(rsp, rnp) {
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
- if (rnp->expmaskinit == rnp->expmaskinitnext) {
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- continue; /* No new CPUs, nothing to do. */
- }
-
- /* Update this node's mask, track old value for propagation. */
- oldmask = rnp->expmaskinit;
- rnp->expmaskinit = rnp->expmaskinitnext;
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
-
- /* If was already nonzero, nothing to propagate. */
- if (oldmask)
- continue;
-
- /* Propagate the new CPU up the tree. */
- mask = rnp->grpmask;
- rnp_up = rnp->parent;
- done = false;
- while (rnp_up) {
- raw_spin_lock_irqsave_rcu_node(rnp_up, flags);
- if (rnp_up->expmaskinit)
- done = true;
- rnp_up->expmaskinit |= mask;
- raw_spin_unlock_irqrestore_rcu_node(rnp_up, flags);
- if (done)
- break;
- mask = rnp_up->grpmask;
- rnp_up = rnp_up->parent;
- }
- }
-}
-
-/*
- * Reset the ->expmask values in the rcu_node tree in preparation for
- * a new expedited grace period.
- */
-static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp)
-{
- unsigned long flags;
- struct rcu_node *rnp;
-
- sync_exp_reset_tree_hotplug(rsp);
- rcu_for_each_node_breadth_first(rsp, rnp) {
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
- WARN_ON_ONCE(rnp->expmask);
- rnp->expmask = rnp->expmaskinit;
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- }
-}
-
-/*
- * Return non-zero if there is no RCU expedited grace period in progress
- * for the specified rcu_node structure, in other words, if all CPUs and
- * tasks covered by the specified rcu_node structure have done their bit
- * for the current expedited grace period. Works only for preemptible
- * RCU -- other RCU implementation use other means.
- *
- * Caller must hold the rcu_state's exp_mutex.
- */
-static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
-{
- return rnp->exp_tasks == NULL &&
- READ_ONCE(rnp->expmask) == 0;
-}
-
-/*
- * Report the exit from RCU read-side critical section for the last task
- * that queued itself during or before the current expedited preemptible-RCU
- * grace period. This event is reported either to the rcu_node structure on
- * which the task was queued or to one of that rcu_node structure's ancestors,
- * recursively up the tree. (Calm down, calm down, we do the recursion
- * iteratively!)
- *
- * Caller must hold the rcu_state's exp_mutex and the specified rcu_node
- * structure's ->lock.
- */
-static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
- bool wake, unsigned long flags)
- __releases(rnp->lock)
-{
- unsigned long mask;
-
- for (;;) {
- if (!sync_rcu_preempt_exp_done(rnp)) {
- if (!rnp->expmask)
- rcu_initiate_boost(rnp, flags);
- else
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- break;
- }
- if (rnp->parent == NULL) {
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- if (wake) {
- smp_mb(); /* EGP done before wake_up(). */
- swake_up(&rsp->expedited_wq);
- }
- break;
- }
- mask = rnp->grpmask;
- raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled */
- rnp = rnp->parent;
- raw_spin_lock_rcu_node(rnp); /* irqs already disabled */
- WARN_ON_ONCE(!(rnp->expmask & mask));
- rnp->expmask &= ~mask;
- }
-}
-
-/*
- * Report expedited quiescent state for specified node. This is a
- * lock-acquisition wrapper function for __rcu_report_exp_rnp().
- *
- * Caller must hold the rcu_state's exp_mutex.
- */
-static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp,
- struct rcu_node *rnp, bool wake)
-{
- unsigned long flags;
-
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
- __rcu_report_exp_rnp(rsp, rnp, wake, flags);
-}
-
-/*
- * Report expedited quiescent state for multiple CPUs, all covered by the
- * specified leaf rcu_node structure. Caller must hold the rcu_state's
- * exp_mutex.
- */
-static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp,
- unsigned long mask, bool wake)
-{
- unsigned long flags;
-
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
- if (!(rnp->expmask & mask)) {
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- return;
- }
- rnp->expmask &= ~mask;
- __rcu_report_exp_rnp(rsp, rnp, wake, flags); /* Releases rnp->lock. */
-}
-
-/*
- * Report expedited quiescent state for specified rcu_data (CPU).
- */
-static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp,
- bool wake)
-{
- rcu_report_exp_cpu_mult(rsp, rdp->mynode, rdp->grpmask, wake);
-}
-
-/* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */
-static bool sync_exp_work_done(struct rcu_state *rsp, atomic_long_t *stat,
- unsigned long s)
-{
- if (rcu_exp_gp_seq_done(rsp, s)) {
- trace_rcu_exp_grace_period(rsp->name, s, TPS("done"));
- /* Ensure test happens before caller kfree(). */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(stat);
- return true;
- }
- return false;
-}
-
-/*
- * Funnel-lock acquisition for expedited grace periods. Returns true
- * if some other task completed an expedited grace period that this task
- * can piggy-back on, and with no mutex held. Otherwise, returns false
- * with the mutex held, indicating that the caller must actually do the
- * expedited grace period.
- */
-static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
-{
- struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
- struct rcu_node *rnp = rdp->mynode;
- struct rcu_node *rnp_root = rcu_get_root(rsp);
-
- /* Low-contention fastpath. */
- if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s) &&
- (rnp == rnp_root ||
- ULONG_CMP_LT(READ_ONCE(rnp_root->exp_seq_rq), s)) &&
- !mutex_is_locked(&rsp->exp_mutex) &&
- mutex_trylock(&rsp->exp_mutex))
- goto fastpath;
-
- /*
- * Each pass through the following loop works its way up
- * the rcu_node tree, returning if others have done the work or
- * otherwise falls through to acquire rsp->exp_mutex. The mapping
- * from CPU to rcu_node structure can be inexact, as it is just
- * promoting locality and is not strictly needed for correctness.
- */
- for (; rnp != NULL; rnp = rnp->parent) {
- if (sync_exp_work_done(rsp, &rdp->exp_workdone1, s))
- return true;
-
- /* Work not done, either wait here or go up. */
- spin_lock(&rnp->exp_lock);
- if (ULONG_CMP_GE(rnp->exp_seq_rq, s)) {
-
- /* Someone else doing GP, so wait for them. */
- spin_unlock(&rnp->exp_lock);
- trace_rcu_exp_funnel_lock(rsp->name, rnp->level,
- rnp->grplo, rnp->grphi,
- TPS("wait"));
- wait_event(rnp->exp_wq[(s >> 1) & 0x3],
- sync_exp_work_done(rsp,
- &rdp->exp_workdone2, s));
- return true;
- }
- rnp->exp_seq_rq = s; /* Followers can wait on us. */
- spin_unlock(&rnp->exp_lock);
- trace_rcu_exp_funnel_lock(rsp->name, rnp->level, rnp->grplo,
- rnp->grphi, TPS("nxtlvl"));
- }
- mutex_lock(&rsp->exp_mutex);
-fastpath:
- if (sync_exp_work_done(rsp, &rdp->exp_workdone3, s)) {
- mutex_unlock(&rsp->exp_mutex);
- return true;
- }
- rcu_exp_gp_seq_start(rsp);
- trace_rcu_exp_grace_period(rsp->name, s, TPS("start"));
- return false;
-}
-
-/* Invoked on each online non-idle CPU for expedited quiescent state. */
-static void sync_sched_exp_handler(void *data)
-{
- struct rcu_data *rdp;
- struct rcu_node *rnp;
- struct rcu_state *rsp = data;
-
- rdp = this_cpu_ptr(rsp->rda);
- rnp = rdp->mynode;
- if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
- __this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))
- return;
- if (rcu_is_cpu_rrupt_from_idle()) {
- rcu_report_exp_rdp(&rcu_sched_state,
- this_cpu_ptr(&rcu_sched_data), true);
- return;
- }
- __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true);
- resched_cpu(smp_processor_id());
-}
-
-/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */
-static void sync_sched_exp_online_cleanup(int cpu)
-{
- struct rcu_data *rdp;
- int ret;
- struct rcu_node *rnp;
- struct rcu_state *rsp = &rcu_sched_state;
-
- rdp = per_cpu_ptr(rsp->rda, cpu);
- rnp = rdp->mynode;
- if (!(READ_ONCE(rnp->expmask) & rdp->grpmask))
- return;
- ret = smp_call_function_single(cpu, sync_sched_exp_handler, rsp, 0);
- WARN_ON_ONCE(ret);
-}
-
-/*
- * Select the nodes that the upcoming expedited grace period needs
- * to wait for.
- */
-static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
- smp_call_func_t func)
-{
- int cpu;
- unsigned long flags;
- unsigned long mask;
- unsigned long mask_ofl_test;
- unsigned long mask_ofl_ipi;
- int ret;
- struct rcu_node *rnp;
-
- sync_exp_reset_tree(rsp);
- rcu_for_each_leaf_node(rsp, rnp) {
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
-
- /* Each pass checks a CPU for identity, offline, and idle. */
- mask_ofl_test = 0;
- for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) {
- struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
- struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
-
- if (raw_smp_processor_id() == cpu ||
- !(atomic_add_return(0, &rdtp->dynticks) & 0x1))
- mask_ofl_test |= rdp->grpmask;
- }
- mask_ofl_ipi = rnp->expmask & ~mask_ofl_test;
-
- /*
- * Need to wait for any blocked tasks as well. Note that
- * additional blocking tasks will also block the expedited
- * GP until such time as the ->expmask bits are cleared.
- */
- if (rcu_preempt_has_tasks(rnp))
- rnp->exp_tasks = rnp->blkd_tasks.next;
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
-
- /* IPI the remaining CPUs for expedited quiescent state. */
- mask = 1;
- for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
- if (!(mask_ofl_ipi & mask))
- continue;
-retry_ipi:
- ret = smp_call_function_single(cpu, func, rsp, 0);
- if (!ret) {
- mask_ofl_ipi &= ~mask;
- continue;
- }
- /* Failed, raced with offline. */
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
- if (cpu_online(cpu) &&
- (rnp->expmask & mask)) {
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- schedule_timeout_uninterruptible(1);
- if (cpu_online(cpu) &&
- (rnp->expmask & mask))
- goto retry_ipi;
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
- }
- if (!(rnp->expmask & mask))
- mask_ofl_ipi &= ~mask;
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- }
- /* Report quiescent states for those that went offline. */
- mask_ofl_test |= mask_ofl_ipi;
- if (mask_ofl_test)
- rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false);
- }
-}
-
-static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
-{
- int cpu;
- unsigned long jiffies_stall;
- unsigned long jiffies_start;
- unsigned long mask;
- int ndetected;
- struct rcu_node *rnp;
- struct rcu_node *rnp_root = rcu_get_root(rsp);
- int ret;
-
- jiffies_stall = rcu_jiffies_till_stall_check();
- jiffies_start = jiffies;
-
- for (;;) {
- ret = swait_event_timeout(
- rsp->expedited_wq,
- sync_rcu_preempt_exp_done(rnp_root),
- jiffies_stall);
- if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root))
- return;
- if (ret < 0) {
- /* Hit a signal, disable CPU stall warnings. */
- swait_event(rsp->expedited_wq,
- sync_rcu_preempt_exp_done(rnp_root));
- return;
- }
- pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {",
- rsp->name);
- ndetected = 0;
- rcu_for_each_leaf_node(rsp, rnp) {
- ndetected += rcu_print_task_exp_stall(rnp);
- mask = 1;
- for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
- struct rcu_data *rdp;
-
- if (!(rnp->expmask & mask))
- continue;
- ndetected++;
- rdp = per_cpu_ptr(rsp->rda, cpu);
- pr_cont(" %d-%c%c%c", cpu,
- "O."[!!cpu_online(cpu)],
- "o."[!!(rdp->grpmask & rnp->expmaskinit)],
- "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]);
- }
- mask <<= 1;
- }
- pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n",
- jiffies - jiffies_start, rsp->expedited_sequence,
- rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]);
- if (ndetected) {
- pr_err("blocking rcu_node structures:");
- rcu_for_each_node_breadth_first(rsp, rnp) {
- if (rnp == rnp_root)
- continue; /* printed unconditionally */
- if (sync_rcu_preempt_exp_done(rnp))
- continue;
- pr_cont(" l=%u:%d-%d:%#lx/%c",
- rnp->level, rnp->grplo, rnp->grphi,
- rnp->expmask,
- ".T"[!!rnp->exp_tasks]);
- }
- pr_cont("\n");
- }
- rcu_for_each_leaf_node(rsp, rnp) {
- mask = 1;
- for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
- if (!(rnp->expmask & mask))
- continue;
- dump_cpu_task(cpu);
- }
- }
- jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3;
- }
-}
-
-/*
- * Wait for the current expedited grace period to complete, and then
- * wake up everyone who piggybacked on the just-completed expedited
- * grace period. Also update all the ->exp_seq_rq counters as needed
- * in order to avoid counter-wrap problems.
- */
-static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s)
-{
- struct rcu_node *rnp;
-
- synchronize_sched_expedited_wait(rsp);
- rcu_exp_gp_seq_end(rsp);
- trace_rcu_exp_grace_period(rsp->name, s, TPS("end"));
-
- /*
- * Switch over to wakeup mode, allowing the next GP, but -only- the
- * next GP, to proceed.
- */
- mutex_lock(&rsp->exp_wake_mutex);
- mutex_unlock(&rsp->exp_mutex);
-
- rcu_for_each_node_breadth_first(rsp, rnp) {
- if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) {
- spin_lock(&rnp->exp_lock);
- /* Recheck, avoid hang in case someone just arrived. */
- if (ULONG_CMP_LT(rnp->exp_seq_rq, s))
- rnp->exp_seq_rq = s;
- spin_unlock(&rnp->exp_lock);
- }
- wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x3]);
- }
- trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake"));
- mutex_unlock(&rsp->exp_wake_mutex);
-}
-
-/**
- * synchronize_sched_expedited - Brute-force RCU-sched grace period
- *
- * Wait for an RCU-sched grace period to elapse, but use a "big hammer"
- * approach to force the grace period to end quickly. This consumes
- * significant time on all CPUs and is unfriendly to real-time workloads,
- * so is thus not recommended for any sort of common-case code. In fact,
- * if you are using synchronize_sched_expedited() in a loop, please
- * restructure your code to batch your updates, and then use a single
- * synchronize_sched() instead.
- *
- * This implementation can be thought of as an application of sequence
- * locking to expedited grace periods, but using the sequence counter to
- * determine when someone else has already done the work instead of for
- * retrying readers.
- */
-void synchronize_sched_expedited(void)
-{
- unsigned long s;
- struct rcu_state *rsp = &rcu_sched_state;
-
- /* If only one CPU, this is automatically a grace period. */
- if (rcu_blocking_is_gp())
- return;
-
- /* If expedited grace periods are prohibited, fall back to normal. */
- if (rcu_gp_is_normal()) {
- wait_rcu_gp(call_rcu_sched);
- return;
- }
-
- /* Take a snapshot of the sequence number. */
- s = rcu_exp_gp_seq_snap(rsp);
- if (exp_funnel_lock(rsp, s))
- return; /* Someone else did our work for us. */
-
- /* Initialize the rcu_node tree in preparation for the wait. */
- sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler);
-
- /* Wait and clean up, including waking everyone. */
- rcu_exp_wait_wake(rsp, s);
-}
-EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
-
/*
* Check to see if there is any immediate RCU-related work to be done
* by the current CPU, for the specified type of RCU, returning 1 if so.
@@ -4281,7 +3747,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
/* Set up local state, ensuring consistent view of global state. */
raw_spin_lock_irqsave_rcu_node(rnp, flags);
- rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
+ rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu);
rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
@@ -4340,12 +3806,58 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
-static void rcu_prepare_cpu(int cpu)
+int rcutree_prepare_cpu(unsigned int cpu)
{
struct rcu_state *rsp;
for_each_rcu_flavor(rsp)
rcu_init_percpu_data(cpu, rsp);
+
+ rcu_prepare_kthreads(cpu);
+ rcu_spawn_all_nocb_kthreads(cpu);
+
+ return 0;
+}
+
+static void rcutree_affinity_setting(unsigned int cpu, int outgoing)
+{
+ struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
+
+ rcu_boost_kthread_setaffinity(rdp->mynode, outgoing);
+}
+
+int rcutree_online_cpu(unsigned int cpu)
+{
+ sync_sched_exp_online_cleanup(cpu);
+ rcutree_affinity_setting(cpu, -1);
+ return 0;
+}
+
+int rcutree_offline_cpu(unsigned int cpu)
+{
+ rcutree_affinity_setting(cpu, cpu);
+ return 0;
+}
+
+
+int rcutree_dying_cpu(unsigned int cpu)
+{
+ struct rcu_state *rsp;
+
+ for_each_rcu_flavor(rsp)
+ rcu_cleanup_dying_cpu(rsp);
+ return 0;
+}
+
+int rcutree_dead_cpu(unsigned int cpu)
+{
+ struct rcu_state *rsp;
+
+ for_each_rcu_flavor(rsp) {
+ rcu_cleanup_dead_cpu(cpu, rsp);
+ do_nocb_deferred_wakeup(per_cpu_ptr(rsp->rda, cpu));
+ }
+ return 0;
}
#ifdef CONFIG_HOTPLUG_CPU
@@ -4364,9 +3876,6 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
- if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
- return;
-
/* Remove outgoing CPU from mask in the leaf rcu_node structure. */
mask = rdp->grpmask;
raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
@@ -4388,52 +3897,6 @@ void rcu_report_dead(unsigned int cpu)
}
#endif
-/*
- * Handle CPU online/offline notification events.
- */
-int rcu_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
-{
- long cpu = (long)hcpu;
- struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
- struct rcu_node *rnp = rdp->mynode;
- struct rcu_state *rsp;
-
- switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- rcu_prepare_cpu(cpu);
- rcu_prepare_kthreads(cpu);
- rcu_spawn_all_nocb_kthreads(cpu);
- break;
- case CPU_ONLINE:
- case CPU_DOWN_FAILED:
- sync_sched_exp_online_cleanup(cpu);
- rcu_boost_kthread_setaffinity(rnp, -1);
- break;
- case CPU_DOWN_PREPARE:
- rcu_boost_kthread_setaffinity(rnp, cpu);
- break;
- case CPU_DYING:
- case CPU_DYING_FROZEN:
- for_each_rcu_flavor(rsp)
- rcu_cleanup_dying_cpu(rsp);
- break;
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- for_each_rcu_flavor(rsp) {
- rcu_cleanup_dead_cpu(cpu, rsp);
- do_nocb_deferred_wakeup(per_cpu_ptr(rsp->rda, cpu));
- }
- break;
- default:
- break;
- }
- return NOTIFY_OK;
-}
-
static int rcu_pm_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
@@ -4745,10 +4208,10 @@ void __init rcu_init(void)
* this is called early in boot, before either interrupts
* or the scheduler are operational.
*/
- cpu_notifier(rcu_cpu_notify, 0);
pm_notifier(rcu_pm_notify, 0);
for_each_online_cpu(cpu)
- rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
+ rcutree_prepare_cpu(cpu);
}
+#include "tree_exp.h"
#include "tree_plugin.h"
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index e3959f5e6..f714f873b 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -254,6 +254,13 @@ struct rcu_node {
} ____cacheline_internodealigned_in_smp;
/*
+ * Bitmasks in an rcu_node cover the interval [grplo, grphi] of CPU IDs, and
+ * are indexed relative to this interval rather than the global CPU ID space.
+ * This generates the bit for a CPU in node-local masks.
+ */
+#define leaf_node_cpu_bit(rnp, cpu) (1UL << ((cpu) - (rnp)->grplo))
+
+/*
* Do a full breadth-first scan of the rcu_node structures for the
* specified rcu_state structure.
*/
@@ -281,6 +288,14 @@ struct rcu_node {
(rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
/*
+ * Iterate over all possible CPUs in a leaf RCU node.
+ */
+#define for_each_leaf_node_possible_cpu(rnp, cpu) \
+ for ((cpu) = cpumask_next((rnp)->grplo - 1, cpu_possible_mask); \
+ (cpu) <= (rnp)->grphi; \
+ (cpu) = cpumask_next((cpu), cpu_possible_mask))
+
+/*
* Union to allow "aggregate OR" operation on the need for a quiescent
* state by the normal and expedited grace periods.
*/
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
new file mode 100644
index 000000000..6d86ab6ec
--- /dev/null
+++ b/kernel/rcu/tree_exp.h
@@ -0,0 +1,655 @@
+/*
+ * RCU expedited grace periods
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ * Copyright IBM Corporation, 2016
+ *
+ * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ */
+
+/* Wrapper functions for expedited grace periods. */
+/* Apply rcu_seq_start() to rsp->expedited_sequence. */
+static void rcu_exp_gp_seq_start(struct rcu_state *rsp)
+{
+ rcu_seq_start(&rsp->expedited_sequence);
+}
+/*
+ * Apply rcu_seq_end() to rsp->expedited_sequence, then execute a full
+ * memory barrier so that consecutive expedited grace periods serialize.
+ */
+static void rcu_exp_gp_seq_end(struct rcu_state *rsp)
+{
+ rcu_seq_end(&rsp->expedited_sequence);
+ smp_mb(); /* Ensure that consecutive grace periods serialize. */
+}
+/*
+ * Take a snapshot of rsp->expedited_sequence via rcu_seq_snap(), preceded
+ * by a full memory barrier, emit a "snap" trace event, and return the
+ * snapshot. The returned value is what callers later hand to
+ * rcu_exp_gp_seq_done() and exp_funnel_lock().
+ */
+static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp)
+{
+ unsigned long s;
+
+ smp_mb(); /* Caller's modifications seen first by other CPUs. */
+ s = rcu_seq_snap(&rsp->expedited_sequence);
+ trace_rcu_exp_grace_period(rsp->name, s, TPS("snap"));
+ return s;
+}
+/*
+ * Return the result of rcu_seq_done() for rsp->expedited_sequence against
+ * the snapshot @s previously obtained from rcu_exp_gp_seq_snap().
+ */
+static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s)
+{
+ return rcu_seq_done(&rsp->expedited_sequence, s);
+}
+
+/*
+ * Reset the ->expmaskinit values in the rcu_node tree to reflect any
+ * recent CPU-online activity. Note that these masks are not cleared
+ * when CPUs go offline, so they reflect the union of all CPUs that have
+ * ever been online. This means that this function normally takes its
+ * no-work-to-do fastpath.
+ *
+ * The fastpath compares rsp->ncpus against the value cached in
+ * rsp->ncpus_snap by the previous call. Each rcu_node is updated under
+ * its own ->lock with interrupts disabled; no two node locks are held at
+ * the same time.
+ */
+static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp)
+{
+ bool done;
+ unsigned long flags;
+ unsigned long mask;
+ unsigned long oldmask;
+ int ncpus = READ_ONCE(rsp->ncpus);
+ struct rcu_node *rnp;
+ struct rcu_node *rnp_up;
+
+ /* If no new CPUs onlined since last time, nothing to do. */
+ if (likely(ncpus == rsp->ncpus_snap))
+ return;
+ rsp->ncpus_snap = ncpus;
+
+ /*
+ * Each pass through the following loop propagates newly onlined
+ * CPUs for the current rcu_node structure up the rcu_node tree.
+ */
+ rcu_for_each_leaf_node(rsp, rnp) {
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ if (rnp->expmaskinit == rnp->expmaskinitnext) {
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ continue; /* No new CPUs, nothing to do. */
+ }
+
+ /* Update this node's mask, track old value for propagation. */
+ oldmask = rnp->expmaskinit;
+ rnp->expmaskinit = rnp->expmaskinitnext;
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+
+ /* If was already nonzero, nothing to propagate. */
+ if (oldmask)
+ continue;
+
+ /* Propagate the new CPU up the tree. */
+ mask = rnp->grpmask;
+ rnp_up = rnp->parent;
+ done = false;
+ while (rnp_up) {
+ raw_spin_lock_irqsave_rcu_node(rnp_up, flags);
+ /*
+ * A parent whose mask was already nonzero still gets our
+ * bit, but the walk stops there: the levels above it are
+ * left untouched by this pass.
+ */
+ if (rnp_up->expmaskinit)
+ done = true;
+ rnp_up->expmaskinit |= mask;
+ raw_spin_unlock_irqrestore_rcu_node(rnp_up, flags);
+ if (done)
+ break;
+ mask = rnp_up->grpmask;
+ rnp_up = rnp_up->parent;
+ }
+ }
+}
+
+/*
+ * Reset the ->expmask values in the rcu_node tree in preparation for
+ * a new expedited grace period.
+ *
+ * First folds in any newly onlined CPUs via sync_exp_reset_tree_hotplug(),
+ * then copies ->expmaskinit into ->expmask for every rcu_node under that
+ * node's ->lock. A nonzero ->expmask at this point triggers a one-time
+ * warning, since it means bits were left over when this reset ran.
+ */
+static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp)
+{
+ unsigned long flags;
+ struct rcu_node *rnp;
+
+ sync_exp_reset_tree_hotplug(rsp);
+ rcu_for_each_node_breadth_first(rsp, rnp) {
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ WARN_ON_ONCE(rnp->expmask);
+ rnp->expmask = rnp->expmaskinit;
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ }
+}
+
+/*
+ * Tell whether the specified rcu_node structure has nothing left to wait
+ * for in the current expedited grace period, that is, no task is recorded
+ * in ->exp_tasks and no bit remains set in ->expmask. Returns non-zero
+ * in that case and zero otherwise. Works only for preemptible RCU --
+ * other RCU implementations use other means.
+ *
+ * Caller must hold the rcu_state's exp_mutex.
+ */
+static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
+{
+	/* A recorded blocked task means we are not done; skip the mask read. */
+	if (rnp->exp_tasks)
+		return 0;
+	return !READ_ONCE(rnp->expmask);
+}
+
+/*
+ * Report the exit from RCU read-side critical section for the last task
+ * that queued itself during or before the current expedited preemptible-RCU
+ * grace period. This event is reported either to the rcu_node structure on
+ * which the task was queued or to one of that rcu_node structure's ancestors,
+ * recursively up the tree. (Calm down, calm down, we do the recursion
+ * iteratively!)
+ *
+ * @wake selects whether rsp->expedited_wq is woken once the root node is
+ * found to be done. @flags is the interrupt state saved by the caller
+ * when it acquired rnp->lock; it is restored when the last lock is dropped.
+ *
+ * Caller must hold the rcu_state's exp_mutex and the specified rcu_node
+ * structure's ->lock.
+ */
+static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
+ bool wake, unsigned long flags)
+ __releases(rnp->lock)
+{
+ unsigned long mask;
+
+ for (;;) {
+ if (!sync_rcu_preempt_exp_done(rnp)) {
+ /*
+ * Not done at this level. With no CPU bits left, only
+ * blocked tasks remain, so hand off to
+ * rcu_initiate_boost() -- presumably it drops rnp->lock,
+ * since this branch does not; confirm in its definition.
+ */
+ if (!rnp->expmask)
+ rcu_initiate_boost(rnp, flags);
+ else
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ break;
+ }
+ if (rnp->parent == NULL) {
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ if (wake) {
+ smp_mb(); /* EGP done before wake_up(). */
+ swake_up(&rsp->expedited_wq);
+ }
+ break;
+ }
+ /* This level is done: clear our bit in the parent and recheck there. */
+ mask = rnp->grpmask;
+ raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled */
+ rnp = rnp->parent;
+ raw_spin_lock_rcu_node(rnp); /* irqs already disabled */
+ WARN_ON_ONCE(!(rnp->expmask & mask));
+ rnp->expmask &= ~mask;
+ }
+}
+
+/*
+ * Report expedited quiescent state for specified node. This is a
+ * lock-acquisition wrapper function for __rcu_report_exp_rnp().
+ *
+ * Acquires rnp->lock with interrupts saved into a local; the lock is not
+ * released here but inside __rcu_report_exp_rnp(), which is annotated
+ * __releases(rnp->lock).
+ *
+ * Caller must hold the rcu_state's exp_mutex.
+ */
+static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp,
+ struct rcu_node *rnp, bool wake)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ __rcu_report_exp_rnp(rsp, rnp, wake, flags);
+}
+
+/*
+ * Report expedited quiescent state for multiple CPUs, all covered by the
+ * specified leaf rcu_node structure. Caller must hold the rcu_state's
+ * exp_mutex.
+ *
+ * @mask holds the node-local bits to clear from rnp->expmask. If none of
+ * those bits is currently set the function returns without reporting
+ * anything further up the tree.
+ */
+static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp,
+ unsigned long mask, bool wake)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ if (!(rnp->expmask & mask)) {
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ return;
+ }
+ rnp->expmask &= ~mask;
+ __rcu_report_exp_rnp(rsp, rnp, wake, flags); /* Releases rnp->lock. */
+}
+
+/*
+ * Report expedited quiescent state for specified rcu_data (CPU).
+ *
+ * Thin wrapper: forwards the CPU's leaf node (rdp->mynode) and its bit
+ * within that node (rdp->grpmask) to rcu_report_exp_cpu_mult().
+ */
+static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp,
+ bool wake)
+{
+ rcu_report_exp_cpu_mult(rsp, rdp->mynode, rdp->grpmask, wake);
+}
+
+/*
+ * Shared work-done check for synchronize_{rcu,sched}_expedited().
+ * Returns true, after tracing "done" and bumping the statistics counter
+ * @stat, when the expedited sequence has reached snapshot @s. Returns
+ * false with no side effects otherwise.
+ */
+static bool sync_exp_work_done(struct rcu_state *rsp, atomic_long_t *stat,
+			       unsigned long s)
+{
+	if (!rcu_exp_gp_seq_done(rsp, s))
+		return false;
+
+	trace_rcu_exp_grace_period(rsp->name, s, TPS("done"));
+	/* Ensure test happens before caller kfree(). */
+	smp_mb__before_atomic(); /* ^^^ */
+	atomic_long_inc(stat);
+	return true;
+}
+
+/*
+ * Funnel-lock acquisition for expedited grace periods. Returns true
+ * if some other task completed an expedited grace period that this task
+ * can piggy-back on, and with no mutex held. Otherwise, returns false
+ * with the mutex held, indicating that the caller must actually do the
+ * expedited grace period.
+ *
+ * @s is the sequence snapshot from rcu_exp_gp_seq_snap(). On the false
+ * return path this function has also called rcu_exp_gp_seq_start(), so
+ * the new grace period is already marked as started.
+ */
+static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
+{
+ struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
+ struct rcu_node *rnp = rdp->mynode;
+ struct rcu_node *rnp_root = rcu_get_root(rsp);
+
+ /* Low-contention fastpath. */
+ /* Taken only if nobody has yet requested @s at our leaf or the root. */
+ if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s) &&
+ (rnp == rnp_root ||
+ ULONG_CMP_LT(READ_ONCE(rnp_root->exp_seq_rq), s)) &&
+ mutex_trylock(&rsp->exp_mutex))
+ goto fastpath;
+
+ /*
+ * Each pass through the following loop works its way up
+ * the rcu_node tree, returning if others have done the work or
+ * otherwise falls through to acquire rsp->exp_mutex. The mapping
+ * from CPU to rcu_node structure can be inexact, as it is just
+ * promoting locality and is not strictly needed for correctness.
+ */
+ for (; rnp != NULL; rnp = rnp->parent) {
+ if (sync_exp_work_done(rsp, &rdp->exp_workdone1, s))
+ return true;
+
+ /* Work not done, either wait here or go up. */
+ spin_lock(&rnp->exp_lock);
+ if (ULONG_CMP_GE(rnp->exp_seq_rq, s)) {
+
+ /* Someone else doing GP, so wait for them. */
+ spin_unlock(&rnp->exp_lock);
+ trace_rcu_exp_funnel_lock(rsp->name, rnp->level,
+ rnp->grplo, rnp->grphi,
+ TPS("wait"));
+ /* One of four wait queues, selected from @s with its low bit dropped. */
+ wait_event(rnp->exp_wq[(s >> 1) & 0x3],
+ sync_exp_work_done(rsp,
+ &rdp->exp_workdone2, s));
+ return true;
+ }
+ rnp->exp_seq_rq = s; /* Followers can wait on us. */
+ spin_unlock(&rnp->exp_lock);
+ trace_rcu_exp_funnel_lock(rsp->name, rnp->level, rnp->grplo,
+ rnp->grphi, TPS("nxtlvl"));
+ }
+ mutex_lock(&rsp->exp_mutex);
+fastpath:
+ /* Recheck under the mutex: the work may have completed while we waited. */
+ if (sync_exp_work_done(rsp, &rdp->exp_workdone3, s)) {
+ mutex_unlock(&rsp->exp_mutex);
+ return true;
+ }
+ rcu_exp_gp_seq_start(rsp);
+ trace_rcu_exp_grace_period(rsp->name, s, TPS("start"));
+ return false;
+}
+
+/*
+ * Invoked on each online non-idle CPU for expedited quiescent state.
+ *
+ * @data is the rcu_state pointer handed to smp_call_function_single().
+ * Does nothing if this CPU's bit is already clear in its leaf ->expmask or
+ * if the per-CPU cpu_no_qs.b.exp flag is already set. If the interrupt
+ * came from idle, the quiescent state is reported immediately. Otherwise
+ * the flag is set and a reschedule of this CPU is requested.
+ */
+static void sync_sched_exp_handler(void *data)
+{
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+ struct rcu_state *rsp = data;
+
+ rdp = this_cpu_ptr(rsp->rda);
+ rnp = rdp->mynode;
+ if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
+ __this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))
+ return;
+ if (rcu_is_cpu_rrupt_from_idle()) {
+ rcu_report_exp_rdp(&rcu_sched_state,
+ this_cpu_ptr(&rcu_sched_data), true);
+ return;
+ }
+ __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true);
+ resched_cpu(smp_processor_id());
+}
+
+/*
+ * Send IPI for expedited cleanup if needed at end of CPU-hotplug operation.
+ *
+ * Only RCU-sched is handled (rsp is fixed to &rcu_sched_state). Nothing
+ * is sent when @cpu's bit is already clear in its leaf ->expmask. The IPI
+ * is issued without waiting for the handler to finish (last argument 0),
+ * and a nonzero return from smp_call_function_single() triggers a one-time
+ * warning.
+ */
+static void sync_sched_exp_online_cleanup(int cpu)
+{
+ struct rcu_data *rdp;
+ int ret;
+ struct rcu_node *rnp;
+ struct rcu_state *rsp = &rcu_sched_state;
+
+ rdp = per_cpu_ptr(rsp->rda, cpu);
+ rnp = rdp->mynode;
+ if (!(READ_ONCE(rnp->expmask) & rdp->grpmask))
+ return;
+ ret = smp_call_function_single(cpu, sync_sched_exp_handler, rsp, 0);
+ WARN_ON_ONCE(ret);
+}
+
+/*
+ * Select the nodes that the upcoming expedited grace period needs
+ * to wait for.
+ *
+ * @func is the IPI handler to run on each CPU that must be waited for.
+ * For every leaf node this function: (1) builds mask_ofl_test, the CPUs
+ * that need no IPI (the current CPU and those whose dynticks counter has
+ * its low bit clear); (2) records blocked tasks in ->exp_tasks; (3) IPIs
+ * the remaining CPUs, retrying on failure while the CPU stays online and
+ * its ->expmask bit stays set; and (4) reports a quiescent state on behalf
+ * of every CPU that did not get an IPI.
+ */
+static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
+ smp_call_func_t func)
+{
+ int cpu;
+ unsigned long flags;
+ unsigned long mask_ofl_test;
+ unsigned long mask_ofl_ipi;
+ int ret;
+ struct rcu_node *rnp;
+
+ sync_exp_reset_tree(rsp);
+ rcu_for_each_leaf_node(rsp, rnp) {
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+
+ /* Each pass checks a CPU for identity, offline, and idle. */
+ /*
+ * NOTE(review): there is no explicit cpu_online() test here;
+ * presumably an offline CPU also shows an even ->dynticks
+ * value -- confirm.
+ */
+ mask_ofl_test = 0;
+ for_each_leaf_node_possible_cpu(rnp, cpu) {
+ struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+ struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+
+ if (raw_smp_processor_id() == cpu ||
+ !(atomic_add_return(0, &rdtp->dynticks) & 0x1))
+ mask_ofl_test |= rdp->grpmask;
+ }
+ mask_ofl_ipi = rnp->expmask & ~mask_ofl_test;
+
+ /*
+ * Need to wait for any blocked tasks as well. Note that
+ * additional blocking tasks will also block the expedited
+ * GP until such time as the ->expmask bits are cleared.
+ */
+ if (rcu_preempt_has_tasks(rnp))
+ rnp->exp_tasks = rnp->blkd_tasks.next;
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+
+ /* IPI the remaining CPUs for expedited quiescent state. */
+ for_each_leaf_node_possible_cpu(rnp, cpu) {
+ unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
+ if (!(mask_ofl_ipi & mask))
+ continue;
+retry_ipi:
+ ret = smp_call_function_single(cpu, func, rsp, 0);
+ if (!ret) {
+ /* IPI delivered: the handler will report for this CPU. */
+ mask_ofl_ipi &= ~mask;
+ continue;
+ }
+ /* Failed, raced with offline. */
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ if (cpu_online(cpu) &&
+ (rnp->expmask & mask)) {
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ /* Sleep one jiffy, then retry if still online and still pending. */
+ schedule_timeout_uninterruptible(1);
+ if (cpu_online(cpu) &&
+ (rnp->expmask & mask))
+ goto retry_ipi;
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ }
+ /* Bit already cleared by someone else: nothing left to report. */
+ if (!(rnp->expmask & mask))
+ mask_ofl_ipi &= ~mask;
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ }
+ /* Report quiescent states for those that went offline. */
+ mask_ofl_test |= mask_ofl_ipi;
+ if (mask_ofl_test)
+ rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false);
+ }
+}
+
+/*
+ * Wait on rsp->expedited_wq until the root rcu_node reports that the
+ * current expedited grace period is done, printing stall diagnostics each
+ * time the wait times out.
+ *
+ * The first timeout is rcu_jiffies_till_stall_check() jiffies; after a
+ * stall report the timeout becomes 3 * rcu_jiffies_till_stall_check() + 3.
+ * A stall report lists every CPU whose bit is still set in its leaf
+ * ->expmask, then the non-root rcu_node structures that are not yet done,
+ * and finally dumps the current task of each stalled CPU.
+ */
+static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
+{
+ int cpu;
+ unsigned long jiffies_stall;
+ unsigned long jiffies_start;
+ unsigned long mask;
+ int ndetected;
+ struct rcu_node *rnp;
+ struct rcu_node *rnp_root = rcu_get_root(rsp);
+ int ret;
+
+ jiffies_stall = rcu_jiffies_till_stall_check();
+ jiffies_start = jiffies;
+
+ for (;;) {
+ ret = swait_event_timeout(
+ rsp->expedited_wq,
+ sync_rcu_preempt_exp_done(rnp_root),
+ jiffies_stall);
+ if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root))
+ return;
+ if (ret < 0) {
+ /* Hit a signal, disable CPU stall warnings. */
+ swait_event(rsp->expedited_wq,
+ sync_rcu_preempt_exp_done(rnp_root));
+ return;
+ }
+ /* Timed out with work outstanding: emit a stall report. */
+ pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {",
+ rsp->name);
+ ndetected = 0;
+ rcu_for_each_leaf_node(rsp, rnp) {
+ ndetected += rcu_print_task_exp_stall(rnp);
+ for_each_leaf_node_possible_cpu(rnp, cpu) {
+ struct rcu_data *rdp;
+
+ mask = leaf_node_cpu_bit(rnp, cpu);
+ if (!(rnp->expmask & mask))
+ continue;
+ ndetected++;
+ rdp = per_cpu_ptr(rsp->rda, cpu);
+ /*
+ * Three flag characters per CPU; each prints '.' when
+ * the tested condition holds and a letter when it does
+ * not: 'O' = not online, 'o' = bit absent from
+ * ->expmaskinit, 'N' = bit absent from ->expmaskinitnext.
+ */
+ pr_cont(" %d-%c%c%c", cpu,
+ "O."[!!cpu_online(cpu)],
+ "o."[!!(rdp->grpmask & rnp->expmaskinit)],
+ "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]);
+ }
+ }
+ pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n",
+ jiffies - jiffies_start, rsp->expedited_sequence,
+ rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]);
+ if (ndetected) {
+ pr_err("blocking rcu_node structures:");
+ rcu_for_each_node_breadth_first(rsp, rnp) {
+ if (rnp == rnp_root)
+ continue; /* printed unconditionally */
+ if (sync_rcu_preempt_exp_done(rnp))
+ continue;
+ pr_cont(" l=%u:%d-%d:%#lx/%c",
+ rnp->level, rnp->grplo, rnp->grphi,
+ rnp->expmask,
+ ".T"[!!rnp->exp_tasks]);
+ }
+ pr_cont("\n");
+ }
+ rcu_for_each_leaf_node(rsp, rnp) {
+ for_each_leaf_node_possible_cpu(rnp, cpu) {
+ mask = leaf_node_cpu_bit(rnp, cpu);
+ if (!(rnp->expmask & mask))
+ continue;
+ dump_cpu_task(cpu);
+ }
+ }
+ jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3;
+ }
+}
+
+/*
+ * Wait for the current expedited grace period to complete, and then
+ * wake up everyone who piggybacked on the just-completed expedited
+ * grace period. Also update all the ->exp_seq_rq counters as needed
+ * in order to avoid counter-wrap problems.
+ *
+ * @rsp: RCU flavor whose expedited grace period is being completed.
+ * @s: sequence snapshot this grace period satisfies, i.e. the same value
+ *     that piggybacking tasks passed to exp_funnel_lock().
+ *
+ * Entered with rsp->exp_mutex held (exp_funnel_lock() returned false);
+ * returns with neither exp_mutex nor exp_wake_mutex held.
+ */
+static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s)
+{
+	struct rcu_node *rnp;
+
+	synchronize_sched_expedited_wait(rsp);
+	rcu_exp_gp_seq_end(rsp);
+	trace_rcu_exp_grace_period(rsp->name, s, TPS("end"));
+
+	/*
+	 * Switch over to wakeup mode, allowing the next GP, but -only- the
+	 * next GP, to proceed.
+	 */
+	mutex_lock(&rsp->exp_wake_mutex);
+	mutex_unlock(&rsp->exp_mutex);
+
+	rcu_for_each_node_breadth_first(rsp, rnp) {
+		if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) {
+			spin_lock(&rnp->exp_lock);
+			/* Recheck, avoid hang in case someone just arrived. */
+			if (ULONG_CMP_LT(rnp->exp_seq_rq, s))
+				rnp->exp_seq_rq = s;
+			spin_unlock(&rnp->exp_lock);
+		}
+		/*
+		 * Index the wait queue with our own snapshot, exactly as the
+		 * waiters in exp_funnel_lock() do. ->expedited_sequence must
+		 * not be used here: exp_mutex has been released, so the next
+		 * grace period may already have advanced it, which would
+		 * direct this wakeup at the wrong queue and strand our
+		 * waiters.
+		 */
+		wake_up_all(&rnp->exp_wq[(s >> 1) & 0x3]);
+	}
+	trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake"));
+	mutex_unlock(&rsp->exp_wake_mutex);
+}
+
+/**
+ * synchronize_sched_expedited - Brute-force RCU-sched grace period
+ *
+ * Wait for an RCU-sched grace period to elapse, but use a "big hammer"
+ * approach to force the grace period to end quickly. This consumes
+ * significant time on all CPUs and is unfriendly to real-time workloads,
+ * so is thus not recommended for any sort of common-case code. In fact,
+ * if you are using synchronize_sched_expedited() in a loop, please
+ * restructure your code to batch your updates, and then use a single
+ * synchronize_sched() instead.
+ *
+ * This implementation can be thought of as an application of sequence
+ * locking to expedited grace periods, but using the sequence counter to
+ * determine when someone else has already done the work instead of for
+ * retrying readers.
+ *
+ * Returns immediately when rcu_blocking_is_gp() says blocking is already a
+ * grace period, and falls back to a normal grace period via wait_rcu_gp()
+ * when rcu_gp_is_normal() is true.
+ */
+void synchronize_sched_expedited(void)
+{
+ unsigned long s;
+ struct rcu_state *rsp = &rcu_sched_state;
+
+ /* If only one CPU, this is automatically a grace period. */
+ if (rcu_blocking_is_gp())
+ return;
+
+ /* If expedited grace periods are prohibited, fall back to normal. */
+ if (rcu_gp_is_normal()) {
+ wait_rcu_gp(call_rcu_sched);
+ return;
+ }
+
+ /* Take a snapshot of the sequence number. */
+ s = rcu_exp_gp_seq_snap(rsp);
+ if (exp_funnel_lock(rsp, s))
+ return; /* Someone else did our work for us. */
+
+ /* Initialize the rcu_node tree in preparation for the wait. */
+ sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler);
+
+ /* Wait and clean up, including waking everyone. */
+ rcu_exp_wait_wake(rsp, s);
+}
+EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
+
+#ifdef CONFIG_PREEMPT_RCU
+
+/*
+ * Remote handler for smp_call_function_single(). If there is an
+ * RCU read-side critical section in effect, request that the
+ * next rcu_read_unlock() record the quiescent state up the
+ * ->expmask fields in the rcu_node tree. Otherwise, immediately
+ * report the quiescent state.
+ *
+ * @info is the rcu_state pointer supplied by the IPI sender. The handler
+ * inspects and updates only the task that is current on this CPU.
+ */
+static void sync_rcu_exp_handler(void *info)
+{
+ struct rcu_data *rdp;
+ struct rcu_state *rsp = info;
+ struct task_struct *t = current;
+
+ /*
+ * Within an RCU read-side critical section, request that the next
+ * rcu_read_unlock() report. Unless this RCU read-side critical
+ * section has already blocked, in which case it is already set
+ * up for the expedited grace period to wait on it.
+ */
+ if (t->rcu_read_lock_nesting > 0 &&
+ !t->rcu_read_unlock_special.b.blocked) {
+ t->rcu_read_unlock_special.b.exp_need_qs = true;
+ return;
+ }
+
+ /*
+ * We are either exiting an RCU read-side critical section (negative
+ * values of t->rcu_read_lock_nesting) or are not in one at all
+ * (zero value of t->rcu_read_lock_nesting). Or we are in an RCU
+ * read-side critical section that blocked before this expedited
+ * grace period started. Either way, we can immediately report
+ * the quiescent state.
+ */
+ rdp = this_cpu_ptr(rsp->rda);
+ rcu_report_exp_rdp(rsp, rdp, true);
+}
+
+/**
+ * synchronize_rcu_expedited - Brute-force RCU grace period
+ *
+ * Wait for an RCU-preempt grace period, but expedite it. The basic
+ * idea is to IPI all non-idle non-nohz online CPUs. The IPI handler
+ * checks whether the CPU is in an RCU-preempt critical section, and
+ * if so, it sets a flag that causes the outermost rcu_read_unlock()
+ * to report the quiescent state. On the other hand, if the CPU is
+ * not in an RCU read-side critical section, the IPI handler reports
+ * the quiescent state immediately.
+ *
+ * Although this is a great improvement over previous expedited
+ * implementations, it is still unfriendly to real-time workloads, so is
+ * thus not recommended for any sort of common-case code. In fact, if
+ * you are using synchronize_rcu_expedited() in a loop, please restructure
+ * your code to batch your updates, and then use a single synchronize_rcu()
+ * instead.
+ */
+void synchronize_rcu_expedited(void)
+{
+ struct rcu_state *rsp = rcu_state_p;
+ unsigned long s;
+
+ /* If expedited grace periods are prohibited, fall back to normal. */
+ if (rcu_gp_is_normal()) {
+ wait_rcu_gp(call_rcu);
+ return;
+ }
+
+ /* Snapshot the sequence, then either piggy-back or become the leader. */
+ s = rcu_exp_gp_seq_snap(rsp);
+ if (exp_funnel_lock(rsp, s))
+ return; /* Someone else did our work for us. */
+
+ /* Initialize the rcu_node tree in preparation for the wait. */
+ sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler);
+
+ /* Wait for ->blkd_tasks lists to drain, then wake everyone up. */
+ rcu_exp_wait_wake(rsp, s);
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
+
+#else /* #ifdef CONFIG_PREEMPT_RCU */
+
+/*
+ * Wait for an rcu-preempt grace period, but make it happen quickly.
+ * But because preemptible RCU does not exist, map to rcu-sched.
+ *
+ * This variant is the one built when CONFIG_PREEMPT_RCU is not defined;
+ * it simply forwards to synchronize_sched_expedited().
+ */
+void synchronize_rcu_expedited(void)
+{
+ synchronize_sched_expedited();
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
+
+#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index ff1cd4e11..0082fce40 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -79,8 +79,6 @@ static void __init rcu_bootup_announce_oddness(void)
pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");
if (IS_ENABLED(CONFIG_PROVE_RCU))
pr_info("\tRCU lockdep checking is enabled.\n");
- if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_RUNNABLE))
- pr_info("\tRCU torture testing starts during boot.\n");
if (RCU_NUM_LVLS >= 4)
pr_info("\tFour(or more)-level hierarchy is enabled.\n");
if (RCU_FANOUT_LEAF != 16)
@@ -681,84 +679,6 @@ void synchronize_rcu(void)
}
EXPORT_SYMBOL_GPL(synchronize_rcu);
-/*
- * Remote handler for smp_call_function_single(). If there is an
- * RCU read-side critical section in effect, request that the
- * next rcu_read_unlock() record the quiescent state up the
- * ->expmask fields in the rcu_node tree. Otherwise, immediately
- * report the quiescent state.
- */
-static void sync_rcu_exp_handler(void *info)
-{
- struct rcu_data *rdp;
- struct rcu_state *rsp = info;
- struct task_struct *t = current;
-
- /*
- * Within an RCU read-side critical section, request that the next
- * rcu_read_unlock() report. Unless this RCU read-side critical
- * section has already blocked, in which case it is already set
- * up for the expedited grace period to wait on it.
- */
- if (t->rcu_read_lock_nesting > 0 &&
- !t->rcu_read_unlock_special.b.blocked) {
- t->rcu_read_unlock_special.b.exp_need_qs = true;
- return;
- }
-
- /*
- * We are either exiting an RCU read-side critical section (negative
- * values of t->rcu_read_lock_nesting) or are not in one at all
- * (zero value of t->rcu_read_lock_nesting). Or we are in an RCU
- * read-side critical section that blocked before this expedited
- * grace period started. Either way, we can immediately report
- * the quiescent state.
- */
- rdp = this_cpu_ptr(rsp->rda);
- rcu_report_exp_rdp(rsp, rdp, true);
-}
-
-/**
- * synchronize_rcu_expedited - Brute-force RCU grace period
- *
- * Wait for an RCU-preempt grace period, but expedite it. The basic
- * idea is to IPI all non-idle non-nohz online CPUs. The IPI handler
- * checks whether the CPU is in an RCU-preempt critical section, and
- * if so, it sets a flag that causes the outermost rcu_read_unlock()
- * to report the quiescent state. On the other hand, if the CPU is
- * not in an RCU read-side critical section, the IPI handler reports
- * the quiescent state immediately.
- *
- * Although this is a greate improvement over previous expedited
- * implementations, it is still unfriendly to real-time workloads, so is
- * thus not recommended for any sort of common-case code. In fact, if
- * you are using synchronize_rcu_expedited() in a loop, please restructure
- * your code to batch your updates, and then Use a single synchronize_rcu()
- * instead.
- */
-void synchronize_rcu_expedited(void)
-{
- struct rcu_state *rsp = rcu_state_p;
- unsigned long s;
-
- /* If expedited grace periods are prohibited, fall back to normal. */
- if (rcu_gp_is_normal()) {
- wait_rcu_gp(call_rcu);
- return;
- }
-
- s = rcu_exp_gp_seq_snap(rsp);
- if (exp_funnel_lock(rsp, s))
- return; /* Someone else did our work for us. */
-
- /* Initialize the rcu_node tree in preparation for the wait. */
- sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler);
-
- /* Wait for ->blkd_tasks lists to drain, then wake everyone up. */
- rcu_exp_wait_wake(rsp, s);
-}
-EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
-
/**
* rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
*
@@ -883,16 +803,6 @@ static void rcu_preempt_check_callbacks(void)
}
/*
- * Wait for an rcu-preempt grace period, but make it happen quickly.
- * But because preemptible RCU does not exist, map to rcu-sched.
- */
-void synchronize_rcu_expedited(void)
-{
- synchronize_sched_expedited();
-}
-EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
-
-/*
* Because preemptible RCU does not exist, rcu_barrier() is just
* another name for rcu_barrier_sched().
*/
@@ -1254,8 +1164,9 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
return;
if (!zalloc_cpumask_var(&cm, GFP_KERNEL))
return;
- for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1)
- if ((mask & 0x1) && cpu != outgoingcpu)
+ for_each_leaf_node_possible_cpu(rnp, cpu)
+ if ((mask & leaf_node_cpu_bit(rnp, cpu)) &&
+ cpu != outgoingcpu)
cpumask_set_cpu(cpu, cm);
if (cpumask_weight(cm) == 0)
cpumask_setall(cm);
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 3e888cd5a..f0d8322bc 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -528,6 +528,7 @@ static int rcu_task_stall_timeout __read_mostly = HZ * 60 * 10;
module_param(rcu_task_stall_timeout, int, 0644);
static void rcu_spawn_tasks_kthread(void);
+static struct task_struct *rcu_tasks_kthread_ptr;
/*
* Post an RCU-tasks callback. First call must be from process context
@@ -537,6 +538,7 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func)
{
unsigned long flags;
bool needwake;
+ bool havetask = READ_ONCE(rcu_tasks_kthread_ptr);
rhp->next = NULL;
rhp->func = func;
@@ -545,7 +547,9 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func)
*rcu_tasks_cbs_tail = rhp;
rcu_tasks_cbs_tail = &rhp->next;
raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags);
- if (needwake) {
+ /* We can't create the thread unless interrupts are enabled. */
+ if ((needwake && havetask) ||
+ (!havetask && !irqs_disabled_flags(flags))) {
rcu_spawn_tasks_kthread();
wake_up(&rcu_tasks_cbs_wq);
}
@@ -790,7 +794,6 @@ static int __noreturn rcu_tasks_kthread(void *arg)
static void rcu_spawn_tasks_kthread(void)
{
static DEFINE_MUTEX(rcu_tasks_kthread_mutex);
- static struct task_struct *rcu_tasks_kthread_ptr;
struct task_struct *t;
if (READ_ONCE(rcu_tasks_kthread_ptr)) {
diff --git a/kernel/relay.c b/kernel/relay.c
index 04d7cf3ef..d79750214 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -451,6 +451,13 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu)
if (!dentry)
goto free_buf;
relay_set_buf_dentry(buf, dentry);
+ } else {
+ /* Only retrieve global info, nothing more, nothing less */
+ dentry = chan->cb->create_buf_file(NULL, NULL,
+ S_IRUSR, buf,
+ &chan->is_global);
+ if (WARN_ON(dentry))
+ goto free_buf;
}
buf->cpu = cpu;
@@ -562,6 +569,10 @@ static int relay_hotcpu_callback(struct notifier_block *nb,
* attributes specified. The created channel buffer files
* will be named base_filename0...base_filenameN-1. File
* permissions will be %S_IRUSR.
+ *
+ * If opening a buffer (@parent = NULL) that you later wish to register
+ * in a filesystem, call relay_late_setup_files() once the @parent dentry
+ * is available.
*/
struct rchan *relay_open(const char *base_filename,
struct dentry *parent,
@@ -640,8 +651,12 @@ static void __relay_set_buf_dentry(void *info)
*
* Returns 0 if successful, non-zero otherwise.
*
- * Use to setup files for a previously buffer-only channel.
- * Useful to do early tracing in kernel, before VFS is up, for example.
+ * Use to setup files for a previously buffer-only channel created
+ * by relay_open() with a NULL parent dentry.
+ *
+ * For example, this is useful for performing early tracing in kernel,
+ * before VFS is up and then exposing the early results once the dentry
+ * is available.
*/
int relay_late_setup_files(struct rchan *chan,
const char *base_filename,
@@ -666,6 +681,20 @@ int relay_late_setup_files(struct rchan *chan,
}
chan->has_base_filename = 1;
chan->parent = parent;
+
+ if (chan->is_global) {
+ err = -EINVAL;
+ if (!WARN_ON_ONCE(!chan->buf[0])) {
+ dentry = relay_create_buf_file(chan, chan->buf[0], 0);
+ if (dentry && !WARN_ON_ONCE(!chan->is_global)) {
+ relay_set_buf_dentry(chan->buf[0], dentry);
+ err = 0;
+ }
+ }
+ mutex_unlock(&relay_channels_mutex);
+ return err;
+ }
+
curr_cpu = get_cpu();
/*
* The CPU hotplug notifier ran before us and created buffers with
@@ -706,6 +735,7 @@ int relay_late_setup_files(struct rchan *chan,
return err;
}
+EXPORT_SYMBOL_GPL(relay_late_setup_files);
/**
* relay_switch_subbuf - switch to a new sub-buffer
diff --git a/kernel/sched/bfs.c b/kernel/sched/bfs.c
index 67f93e752..bb5bac4b2 100644
--- a/kernel/sched/bfs.c
+++ b/kernel/sched/bfs.c
@@ -24,7 +24,7 @@
* 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
* 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins,
* Thomas Gleixner, Mike Kravetz
- * now Brainfuck deadline scheduling policy by Con Kolivas deletes
+ * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes
* a whole lot of those previous things.
*/
@@ -137,7 +137,7 @@
void print_scheduler_version(void)
{
- printk(KERN_INFO "BFS CPU scheduler v0.502 by Con Kolivas.\n");
+ printk(KERN_INFO "BFS CPU scheduler v0.512 by Con Kolivas.\n");
}
/*
@@ -403,7 +403,6 @@ static inline void grq_lock_irq(void)
}
static inline void time_lock_grq(struct rq *rq)
- __acquires(grq.lock)
{
grq_lock();
update_clocks(rq);
@@ -429,86 +428,35 @@ static inline void grq_unlock_irqrestore(unsigned long *flags)
static inline struct rq
*task_grq_lock(struct task_struct *p, unsigned long *flags)
- __acquires(grq.lock)
+ __acquires(p->pi_lock)
{
- grq_lock_irqsave(flags);
+ raw_spin_lock_irqsave(&p->pi_lock, *flags);
+ grq_lock();
return task_rq(p);
}
static inline struct rq
*time_task_grq_lock(struct task_struct *p, unsigned long *flags)
- __acquires(grq.lock)
{
struct rq *rq = task_grq_lock(p, flags);
- update_clocks(rq);
- return rq;
-}
-static inline struct rq *task_grq_lock_irq(struct task_struct *p)
- __acquires(grq.lock)
-{
- grq_lock_irq();
- return task_rq(p);
-}
-
-static inline void time_task_grq_lock_irq(struct task_struct *p)
- __acquires(grq.lock)
-{
- struct rq *rq = task_grq_lock_irq(p);
update_clocks(rq);
+ return rq;
}
-static inline void task_grq_unlock_irq(void)
- __releases(grq.lock)
-{
- grq_unlock_irq();
-}
-
-static inline void task_grq_unlock(unsigned long *flags)
- __releases(grq.lock)
-{
- grq_unlock_irqrestore(flags);
-}
-
-/**
- * grunqueue_is_locked
- *
- * Returns true if the global runqueue is locked.
- * This interface allows printk to be called with the runqueue lock
- * held and know whether or not it is OK to wake up the klogd.
- */
-bool grunqueue_is_locked(void)
-{
- return raw_spin_is_locked(&grq.lock);
-}
-
-void grq_unlock_wait(void)
- __releases(grq.lock)
+static inline void task_grq_unlock(struct task_struct *p, unsigned long *flags)
+ __releases(p->pi_lock)
{
- smp_mb(); /* spin-unlock-wait is not a full memory barrier */
- raw_spin_unlock_wait(&grq.lock);
+ grq_unlock();
+ raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
}
static inline void time_grq_lock(struct rq *rq, unsigned long *flags)
- __acquires(grq.lock)
{
local_irq_save(*flags);
time_lock_grq(rq);
}
-static inline struct rq *__task_grq_lock(struct task_struct *p)
- __acquires(grq.lock)
-{
- grq_lock();
- return task_rq(p);
-}
-
-static inline void __task_grq_unlock(void)
- __releases(grq.lock)
-{
- grq_unlock();
-}
-
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
}
@@ -540,6 +488,40 @@ static inline bool deadline_after(u64 deadline, u64 time)
}
/*
+ * Deadline is "now" in niffies + (offset by priority). Setting the deadline
+ * is the key to everything. It distributes cpu fairly amongst tasks of the
+ * same nice value, it proportions cpu according to nice level, it means the
+ * task that last woke up the longest ago has the earliest deadline, thus
+ * ensuring that interactive tasks get low latency on wake up. The CPU
+ * proportion works out to the square of the virtual deadline difference, so
+ * this equation will give nice 19 3% CPU compared to nice 0.
+ */
+static inline u64 prio_deadline_diff(int user_prio)
+{
+ return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128));
+}
+
+static inline u64 task_deadline_diff(struct task_struct *p)
+{
+ return prio_deadline_diff(TASK_USER_PRIO(p));
+}
+
+static inline u64 static_deadline_diff(int static_prio)
+{
+ return prio_deadline_diff(USER_PRIO(static_prio));
+}
+
+static inline int longest_deadline_diff(void)
+{
+ return prio_deadline_diff(39);
+}
+
+static inline int ms_longest_deadline_diff(void)
+{
+ return NS_TO_MS(longest_deadline_diff());
+}
+
+/*
* A task that is not running or queued will not have a node set.
* A task that is queued but not running will have a node set.
* A task that is currently running will have ->on_cpu set but no node set.
@@ -561,14 +543,23 @@ static void dequeue_task(struct task_struct *p)
sched_info_dequeued(task_rq(p), p);
}
+#ifdef CONFIG_PREEMPT_RCU
+static bool rcu_read_critical(struct task_struct *p)
+{
+ return p->rcu_read_unlock_special.b.blocked;
+}
+#else /* CONFIG_PREEMPT_RCU */
+#define rcu_read_critical(p) (false)
+#endif /* CONFIG_PREEMPT_RCU */
+
/*
* To determine if it's safe for a task of SCHED_IDLEPRIO to actually run as
* an idle task, we ensure none of the following conditions are met.
*/
static bool idleprio_suitable(struct task_struct *p)
{
- return (!freezing(p) && !signal_pending(p) &&
- !(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING)));
+ return (!(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING)) &&
+ !signal_pending(p) && !rcu_read_critical(p) && !freezing(p));
}
/*
@@ -612,9 +603,13 @@ static void enqueue_task(struct task_struct *p, struct rq *rq)
sl_id = p->prio;
else {
sl_id = p->deadline;
- /* Set it to cope with 4 left shifts with locality_diff */
- if (p->prio == IDLE_PRIO)
- sl_id |= 0x0F00000000000000;
+ if (idleprio_task(p)) {
+ /* Set it to cope with 4 left shifts with locality_diff */
+ if (p->prio == IDLE_PRIO)
+ sl_id |= 0x00FF000000000000;
+ else
+ sl_id += longest_deadline_diff();
+ }
}
/*
* Some architectures don't have better than microsecond resolution
@@ -1008,15 +1003,18 @@ static inline void deactivate_task(struct task_struct *p, struct rq *rq)
#ifdef CONFIG_SMP
void set_task_cpu(struct task_struct *p, unsigned int cpu)
{
- unsigned int tcpu;
-
#ifdef CONFIG_LOCKDEP
/*
- * The caller should hold grq lock.
+ * The caller should hold either p->pi_lock or grq lock, when changing
+ * a task's CPU. ->pi_lock for waking tasks, grq lock for runnable tasks.
+ *
+ * Furthermore, all task_rq users should acquire both locks, see
+ * task_grq_lock().
*/
- WARN_ON_ONCE(debug_locks && !lockdep_is_held(&grq.lock));
+ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
+ lockdep_is_held(&grq.lock)));
#endif
- if ((tcpu = task_cpu(p)) == cpu)
+ if (task_cpu(p) == cpu)
return;
trace_sched_migrate_task(p, cpu);
perf_event_task_migrate(p);
@@ -1027,6 +1025,7 @@ void set_task_cpu(struct task_struct *p, unsigned int cpu)
* per-task data have been completed by this moment.
*/
smp_wmb();
+
if (p->on_rq) {
struct rq *rq = task_rq(p);
@@ -1166,7 +1165,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
ncsw = 0;
if (!match_state || p->state == match_state)
ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
- task_grq_unlock(&flags);
+ task_grq_unlock(p, &flags);
/*
* If it changed from the expected state, bail out now.
@@ -1292,9 +1291,7 @@ static inline bool needs_other_cpu(struct task_struct *p, int cpu)
static void try_preempt(struct task_struct *p, struct rq *this_rq)
{
- int cpu, pcpu, highest_prio, highest_cpu;
- struct rq *highest_prio_rq;
- u64 latest_deadline;
+ int i, this_entries = this_rq->soft_affined;
cpumask_t tmp;
if (suitable_idle_cpus(p) && resched_best_idle(p))
@@ -1306,56 +1303,32 @@ static void try_preempt(struct task_struct *p, struct rq *this_rq)
cpumask_and(&tmp, &cpu_online_map, &p->cpus_allowed);
- /* See if this task can preempt the task on the current CPU first. */
- pcpu = cpu_of(this_rq);
- if (likely(cpumask_test_cpu(pcpu, &tmp))) {
- if (smt_schedule(p, this_rq) && can_preempt(p, this_rq->rq_prio, this_rq->rq_deadline)) {
- resched_curr(this_rq);
- return;
- }
- cpumask_clear_cpu(pcpu, &tmp);
- }
-
- highest_prio = latest_deadline = 0;
- highest_prio_rq = NULL;
-
- /* Now look for the CPU with the latest deadline */
- for_each_cpu(cpu, &tmp) {
- struct rq *rq;
- int rq_prio;
- u64 dl;
+ /*
+ * We iterate over CPUs in locality order using rq_order, finding the
+ * first one we can preempt if possible, thus staying closest in
+ * locality.
+ */
+ for (i = 0; i < num_possible_cpus(); i++) {
+ struct rq *rq = this_rq->rq_order[i];
- rq = cpu_rq(cpu);
- rq_prio = rq->rq_prio;
- if (rq_prio < highest_prio)
+ if (!cpumask_test_cpu(rq->cpu, &tmp))
continue;
- dl = rq->rq_deadline;
- if (!sched_interactive && pcpu != cpu)
- dl <<= locality_diff(pcpu, rq);
- if (rq_prio > highest_prio ||
- deadline_after(dl, latest_deadline)) {
- latest_deadline = dl;
- highest_prio = rq_prio;
- highest_cpu = cpu;
- highest_prio_rq = rq;
+ if (!sched_interactive && rq != this_rq && rq->soft_affined <= this_entries)
+ continue;
+ if (smt_schedule(p, rq) && can_preempt(p, rq->rq_prio, rq->rq_deadline)) {
+ /*
+ * If we have decided this task should preempt this CPU,
+ * set the task's CPU to match thereby speeding up matching
+ * this task in earliest_deadline_task.
+ */
+ set_task_cpu(p, rq->cpu);
+ resched_curr(rq);
+ return;
}
}
-
- if (unlikely(!highest_prio_rq))
- return;
- if (!smt_schedule(p, highest_prio_rq))
- return;
- if (can_preempt(p, highest_prio, latest_deadline)) {
- /*
- * If we have decided this task should preempt this CPU,
- * set the task's CPU to match thereby speeding up matching
- * this task in earliest_deadline_task.
- */
- set_task_cpu(p, highest_cpu);
- resched_curr(highest_prio_rq);
- }
}
+
static int __set_cpus_allowed_ptr(struct task_struct *p,
const struct cpumask *new_mask, bool check);
#else /* CONFIG_SMP */
@@ -1501,8 +1474,6 @@ static bool try_to_wake_up(struct task_struct *p, unsigned int state,
struct rq *rq;
int cpu;
- get_cpu();
-
/*
* If we are going to wake up a thread waiting for CONDITION we
* need to ensure that CONDITION=1 done by the caller can not be
@@ -1533,13 +1504,11 @@ static bool try_to_wake_up(struct task_struct *p, unsigned int state,
out_running:
ttwu_post_activation(p, rq, success);
out_unlock:
- task_grq_unlock(&flags);
+ task_grq_unlock(p, &flags);
if (schedstat_enabled())
ttwu_stat(p, cpu, wake_flags);
- put_cpu();
-
return success;
}
@@ -1629,6 +1598,13 @@ int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p)
skiplist_node_init(&p->node);
/*
+ * We mark the process as NEW here. This guarantees that
+ * nobody will actually run it, and a signal or other external
+ * event cannot wake it up and insert it on the runqueue either.
+ */
+ p->state = TASK_NEW;
+
+ /*
* Revert to default priority/policy on fork if requested.
*/
if (unlikely(p->sched_reset_on_fork)) {
@@ -1744,12 +1720,16 @@ static inline void init_schedstats(void) {}
*/
void wake_up_new_task(struct task_struct *p)
{
- struct task_struct *parent;
+ struct task_struct *parent, *rq_curr;
+ struct rq *rq, *new_rq;
unsigned long flags;
- struct rq *rq;
parent = p->parent;
rq = task_grq_lock(p, &flags);
+ if (unlikely(needs_other_cpu(p, task_cpu(p))))
+ set_task_cpu(p, cpumask_any(tsk_cpus_allowed(p)));
+ rq_curr = rq->curr;
+ p->state = TASK_RUNNING;
/*
* Reinit new task deadline as its creator deadline could have changed
@@ -1757,22 +1737,20 @@ void wake_up_new_task(struct task_struct *p)
*/
p->deadline = rq->rq_deadline;
- /*
- * If the task is a new process, current and parent are the same. If
- * the task is a new thread in the thread group, it will have much more
- * in common with current than with the parent.
- */
- set_task_cpu(p, task_cpu(rq->curr));
+ /* The new task might not be able to run on the same CPU as rq->curr */
+ if (unlikely(needs_other_cpu(p, task_cpu(p)))) {
+ set_task_cpu(p, cpumask_any(tsk_cpus_allowed(p)));
+ new_rq = task_rq(p);
+ } else
+ new_rq = rq;
/*
* Make sure we do not leak PI boosting priority to the child.
*/
- p->prio = rq->curr->normal_prio;
+ p->prio = rq_curr->normal_prio;
activate_task(p, rq);
trace_sched_wakeup_new(p);
- if (unlikely(p->policy == SCHED_FIFO))
- goto after_ts_init;
/*
* Share the timeslice between parent and child, thus the
@@ -1784,33 +1762,39 @@ void wake_up_new_task(struct task_struct *p)
* is always equal to current->deadline.
*/
p->last_ran = rq->rq_last_ran;
- if (likely(rq->rq_time_slice >= RESCHED_US * 2)) {
+ if (likely(rq_curr->policy != SCHED_FIFO)) {
rq->rq_time_slice /= 2;
- p->time_slice = rq->rq_time_slice;
-after_ts_init:
- if (rq->curr == parent && !suitable_idle_cpus(p)) {
+ if (unlikely(rq->rq_time_slice < RESCHED_US)) {
/*
- * The VM isn't cloned, so we're in a good position to
- * do child-runs-first in anticipation of an exec. This
- * usually avoids a lot of COW overhead.
+ * Forking task has run out of timeslice. Reschedule it and
+ * start its child with a new time slice and deadline. The
+ * child will end up running first because its deadline will
+ * be slightly earlier.
*/
- __set_tsk_resched(parent);
- } else
- try_preempt(p, rq);
- } else {
- if (rq->curr == parent) {
- /*
- * Forking task has run out of timeslice. Reschedule it and
- * start its child with a new time slice and deadline. The
- * child will end up running first because its deadline will
- * be slightly earlier.
- */
rq->rq_time_slice = 0;
- __set_tsk_resched(parent);
+ __set_tsk_resched(rq_curr);
+ time_slice_expired(p);
+ if (suitable_idle_cpus(p))
+ resched_best_idle(p);
+ else if (unlikely(rq != new_rq))
+ try_preempt(p, new_rq);
+ } else {
+ p->time_slice = rq->rq_time_slice;
+ if (rq_curr == parent && rq == new_rq && !suitable_idle_cpus(p)) {
+ /*
+ * The VM isn't cloned, so we're in a good position to
+ * do child-runs-first in anticipation of an exec. This
+ * usually avoids a lot of COW overhead.
+ */
+ __set_tsk_resched(rq_curr);
+ } else
+ try_preempt(p, new_rq);
}
+ } else {
time_slice_expired(p);
+ try_preempt(p, new_rq);
}
- task_grq_unlock(&flags);
+ task_grq_unlock(p, &flags);
}
#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -2724,7 +2708,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
rq = task_grq_lock(p, &flags);
ns = p->sched_time + do_task_delta_exec(p, rq);
- task_grq_unlock(&flags);
+ task_grq_unlock(p, &flags);
return ns;
}
@@ -2978,7 +2962,7 @@ static void task_running_tick(struct rq *rq)
grq_lock();
requeue_task(p);
- __set_tsk_resched(p);
+ resched_task(p);
grq_unlock();
}
@@ -3083,40 +3067,6 @@ static inline void preempt_latency_stop(int val) { }
#endif
/*
- * Deadline is "now" in niffies + (offset by priority). Setting the deadline
- * is the key to everything. It distributes cpu fairly amongst tasks of the
- * same nice value, it proportions cpu according to nice level, it means the
- * task that last woke up the longest ago has the earliest deadline, thus
- * ensuring that interactive tasks get low latency on wake up. The CPU
- * proportion works out to the square of the virtual deadline difference, so
- * this equation will give nice 19 3% CPU compared to nice 0.
- */
-static inline u64 prio_deadline_diff(int user_prio)
-{
- return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128));
-}
-
-static inline u64 task_deadline_diff(struct task_struct *p)
-{
- return prio_deadline_diff(TASK_USER_PRIO(p));
-}
-
-static inline u64 static_deadline_diff(int static_prio)
-{
- return prio_deadline_diff(USER_PRIO(static_prio));
-}
-
-static inline int longest_deadline_diff(void)
-{
- return prio_deadline_diff(39);
-}
-
-static inline int ms_longest_deadline_diff(void)
-{
- return NS_TO_MS(longest_deadline_diff());
-}
-
-/*
* The time_slice is only refilled when it is empty and that is when we set a
* new deadline.
*/
@@ -3215,13 +3165,12 @@ found_middle:
static inline struct
task_struct *earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle)
{
- struct task_struct *edt = idle;
skiplist_node *node = &grq.node;
+ struct task_struct *edt = idle;
u64 earliest_deadline = ~0ULL;
while ((node = node->next[0]) != &grq.node) {
struct task_struct *p = node->value;
- int tcpu;
/* Make sure affinity is ok */
if (needs_other_cpu(p, cpu))
@@ -3230,22 +3179,24 @@ task_struct *earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *
if (!smt_schedule(p, rq))
continue;
- if (!sched_interactive && (tcpu = task_cpu(p)) != cpu) {
- u64 dl = p->deadline << locality_diff(tcpu, rq);
+ if (!sched_interactive) {
+ int tcpu;
+
+ if ((tcpu = task_cpu(p)) != cpu) {
+ u64 dl = p->deadline << locality_diff(tcpu, rq);
- if (unlikely(!deadline_before(dl, earliest_deadline)))
+ if (!deadline_before(dl, earliest_deadline))
+ continue;
+ earliest_deadline = dl;
+ edt = p;
+ /* We continue even though we've found the earliest
+ * deadline task as the locality offset means there
+ * may be a better candidate after it. */
continue;
- earliest_deadline = dl;
- edt = p;
- /* We continue even though we've found the earliest
- * deadline task as the locality offset means there
- * may be a better candidate after it. */
- continue;
+ }
}
- /* This wouldn't happen if we encountered a better deadline from
- * another CPU and have already set edt. */
- if (likely(p->deadline < earliest_deadline))
- edt = p;
+ /* We've encountered the best deadline local task */
+ edt = p;
break;
}
if (likely(edt != idle))
@@ -3275,6 +3226,9 @@ static noinline void __schedule_bug(struct task_struct *prev)
pr_cont("\n");
}
#endif
+ if (panic_on_warn)
+ panic("scheduling while atomic\n");
+
dump_stack();
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
@@ -3316,10 +3270,6 @@ static inline void set_rq_task(struct rq *rq, struct task_struct *p)
rq->rq_mm = p->mm;
rq->rq_smt_bias = p->smt_bias;
#endif
- if (p != rq->idle)
- rq->rq_running = true;
- else
- rq->rq_running = false;
}
static void reset_rq_task(struct rq *rq, struct task_struct *p)
@@ -3353,7 +3303,7 @@ static void check_smt_siblings(struct rq *this_rq)
if (unlikely(!rq->online))
continue;
p = rq->curr;
- if (!smt_should_schedule(p, this_rq)) {
+ if (!smt_schedule(p, this_rq)) {
set_tsk_need_resched(p);
smp_send_reschedule(other_cpu);
}
@@ -3546,8 +3496,6 @@ static void __sched notrace __schedule(bool preempt)
trace_sched_switch(preempt, prev, next);
rq = context_switch(rq, prev, next); /* unlocks the grq */
- cpu = cpu_of(rq);
- idle = rq->idle;
} else {
check_siblings(rq);
grq_unlock_irq();
@@ -3766,8 +3714,8 @@ EXPORT_SYMBOL(default_wake_function);
void rt_mutex_setprio(struct task_struct *p, int prio)
{
unsigned long flags;
- int queued, oldprio;
struct rq *rq;
+ int oldprio;
BUG_ON(prio < 0 || prio > MAX_PRIO);
@@ -3793,19 +3741,18 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
trace_sched_pi_setprio(p, prio);
oldprio = p->prio;
- queued = task_queued(p);
- if (queued)
- dequeue_task(p);
p->prio = prio;
- if (task_running(p) && prio > oldprio)
- resched_task(p);
- if (queued) {
+ if (task_running(p)){
+ if (prio > oldprio)
+ resched_task(p);
+ } else if (task_queued(p)) {
+ dequeue_task(p);
enqueue_task(p, rq);
- try_preempt(p, rq);
+ if (prio < oldprio)
+ try_preempt(p, rq);
}
-
out_unlock:
- task_grq_unlock(&flags);
+ task_grq_unlock(p, &flags);
}
#endif
@@ -3821,7 +3768,7 @@ static inline void adjust_deadline(struct task_struct *p, int new_prio)
void set_user_nice(struct task_struct *p, long nice)
{
- int queued, new_static, old_static;
+ int new_static, old_static;
unsigned long flags;
struct rq *rq;
@@ -3843,16 +3790,14 @@ void set_user_nice(struct task_struct *p, long nice)
p->static_prio = new_static;
goto out_unlock;
}
- queued = task_queued(p);
- if (queued)
- dequeue_task(p);
adjust_deadline(p, new_static);
old_static = p->static_prio;
p->static_prio = new_static;
p->prio = effective_prio(p);
- if (queued) {
+ if (task_queued(p)) {
+ dequeue_task(p);
enqueue_task(p, rq);
if (new_static < old_static)
try_preempt(p, rq);
@@ -3862,7 +3807,7 @@ void set_user_nice(struct task_struct *p, long nice)
resched_task(p);
}
out_unlock:
- task_grq_unlock(&flags);
+ task_grq_unlock(p, &flags);
}
EXPORT_SYMBOL(set_user_nice);
@@ -4002,11 +3947,15 @@ static void __setscheduler(struct task_struct *p, struct rq *rq, int policy,
p->prio = rt_mutex_get_effective_prio(p, p->normal_prio);
} else
p->prio = p->normal_prio;
+
if (task_running(p)) {
reset_rq_task(rq, p);
- /* Resched only if we might now be preempted */
- if (p->prio > oldprio || p->rt_priority > oldrtprio)
- resched_task(p);
+ resched_task(p);
+ } else if (task_queued(p)) {
+ dequeue_task(p);
+ enqueue_task(p, rq);
+ if (p->prio < oldprio || p->rt_priority > oldrtprio)
+ try_preempt(p, rq);
}
}
@@ -4031,8 +3980,8 @@ __sched_setscheduler(struct task_struct *p, int policy,
const struct sched_param *param, bool user, bool pi)
{
struct sched_param zero_param = { .sched_priority = 0 };
- int queued, retval, oldpolicy = -1;
unsigned long flags, rlim_rtprio = 0;
+ int retval, oldpolicy = -1;
int reset_on_fork;
struct rq *rq;
@@ -4142,20 +4091,17 @@ recheck:
/*
* make sure no PI-waiters arrive (or leave) while we are
* changing the priority of the task:
- */
- raw_spin_lock_irqsave(&p->pi_lock, flags);
- /*
+ *
* To be able to change p->policy safely, the grunqueue lock must be
* held.
*/
- rq = __task_grq_lock(p);
+ rq = task_grq_lock(p, &flags);
/*
* Changing the policy of the stop threads its a very bad idea
*/
if (p == rq->stop) {
- __task_grq_unlock();
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ task_grq_unlock(p, &flags);
return -EINVAL;
}
@@ -4165,31 +4111,21 @@ recheck:
if (unlikely(policy == p->policy && (!is_rt_policy(policy) ||
param->sched_priority == p->rt_priority))) {
- __task_grq_unlock();
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ task_grq_unlock(p, &flags);
return 0;
}
/* recheck policy now with rq lock held */
if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
policy = oldpolicy = -1;
- __task_grq_unlock();
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ task_grq_unlock(p, &flags);
goto recheck;
}
update_clocks(rq);
p->sched_reset_on_fork = reset_on_fork;
- queued = task_queued(p);
- if (queued)
- dequeue_task(p);
__setscheduler(p, rq, policy, param->sched_priority, pi);
- if (queued) {
- enqueue_task(p, rq);
- try_preempt(p, rq);
- }
- __task_grq_unlock();
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ task_grq_unlock(p, &flags);
if (pi)
rt_mutex_adjust_pi(p);
@@ -4706,7 +4642,8 @@ out_unlock:
* @len: length in bytes of the bitmask pointed to by user_mask_ptr
* @user_mask_ptr: user-space pointer to hold the current cpu mask
*
- * Return: 0 on success. An error code otherwise.
+ * Return: size of CPU mask copied to user_mask_ptr on success. An
+ * error code otherwise.
*/
SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
unsigned long __user *, user_mask_ptr)
@@ -5113,6 +5050,8 @@ void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_ma
void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{
cpumask_copy(tsk_cpus_allowed(p), new_mask);
+ if (needs_other_cpu(p, task_cpu(p)))
+ set_task_cpu(p, cpumask_any(tsk_cpus_allowed(p)));
}
#endif
@@ -5376,6 +5315,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
{
const struct cpumask *cpu_valid_mask = cpu_active_mask;
bool running_wrong = false;
+ struct cpumask old_mask;
bool queued = false;
unsigned long flags;
struct rq *rq;
@@ -5399,7 +5339,8 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
goto out;
}
- if (cpumask_equal(tsk_cpus_allowed(p), new_mask))
+ cpumask_copy(&old_mask, tsk_cpus_allowed(p));
+ if (cpumask_equal(&old_mask, new_mask))
goto out;
if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
@@ -5436,12 +5377,16 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
set_task_cpu(p, cpumask_any_and(cpu_valid_mask, new_mask));
out:
- if (queued)
+ if (queued && !cpumask_subset(new_mask, &old_mask))
try_preempt(p, rq);
- task_grq_unlock(&flags);
-
if (running_wrong)
- preempt_schedule_common();
+ preempt_disable();
+ task_grq_unlock(p, &flags);
+
+ if (running_wrong) {
+ __schedule(true);
+ preempt_enable();
+ }
return ret;
}
@@ -5471,6 +5416,11 @@ static void bind_zero(int src_cpu)
cpumask_set_cpu(0, tsk_cpus_allowed(p));
p->zerobound = true;
bound++;
+ if (task_cpu(p) == src_cpu) {
+ set_task_cpu(p, 0);
+ if (task_running(p))
+ resched_task(p);
+ }
}
} while_each_thread(t, p);
@@ -7008,6 +6958,7 @@ void __init sched_init_smp(void)
#ifdef CONFIG_SCHED_SMT
bool smt_threads = false;
#endif
+ struct rq *rq;
cpumask_var_t non_isolated_cpus;
@@ -7045,7 +6996,7 @@ void __init sched_init_smp(void)
* nodes) are treated as very distant.
*/
for_each_online_cpu(cpu) {
- struct rq *rq = cpu_rq(cpu);
+ rq = cpu_rq(cpu);
/* First check if this cpu is in the same node */
for_each_domain(cpu, sd) {
@@ -7084,6 +7035,17 @@ void __init sched_init_smp(void)
}
#endif
}
+ for_each_possible_cpu(cpu) {
+ int total_cpus = 0, locality;
+
+ rq = cpu_rq(cpu);
+ for (locality = 0; locality <= 4; locality++) {
+ for_each_possible_cpu(other_cpu) {
+ if (rq->cpu_locality[other_cpu] == locality)
+ rq->rq_order[total_cpus++] = cpu_rq(other_cpu);
+ }
+ }
+ }
#ifdef CONFIG_SMT_NICE
if (smt_threads) {
check_siblings = &check_smt_siblings;
@@ -7095,7 +7057,8 @@ void __init sched_init_smp(void)
mutex_unlock(&sched_domains_mutex);
for_each_online_cpu(cpu) {
- struct rq *rq = cpu_rq(cpu);
+ rq = cpu_rq(cpu);
+
for_each_online_cpu(other_cpu) {
if (other_cpu <= cpu)
continue;
@@ -7220,6 +7183,10 @@ void __init sched_init(void)
else
rq->cpu_locality[j] = 4;
}
+ rq->rq_order = kmalloc(cpu_ids * sizeof(struct rq *), GFP_ATOMIC);
+ rq->rq_order[0] = rq;
+ for (j = 1; j < cpu_ids; j++)
+ rq->rq_order[j] = cpu_rq(j);
}
#endif
@@ -7323,7 +7290,6 @@ static inline void normalise_rt_tasks(void)
struct task_struct *g, *p;
unsigned long flags;
struct rq *rq;
- int queued;
read_lock(&tasklist_lock);
for_each_process_thread(g, p) {
@@ -7337,16 +7303,8 @@ static inline void normalise_rt_tasks(void)
continue;
rq = task_grq_lock(p, &flags);
- queued = task_queued(p);
- if (queued)
- dequeue_task(p);
__setscheduler(p, rq, SCHED_NORMAL, 0, false);
- if (queued) {
- enqueue_task(p, rq);
- try_preempt(p, rq);
- }
-
- task_grq_unlock(&flags);
+ task_grq_unlock(p, &flags);
}
read_unlock(&tasklist_lock);
}
diff --git a/kernel/sched/bfs_sched.h b/kernel/sched/bfs_sched.h
index e7fe1d0a5..00a16ba0a 100644
--- a/kernel/sched/bfs_sched.h
+++ b/kernel/sched/bfs_sched.h
@@ -22,7 +22,6 @@ struct rq {
int rq_time_slice;
u64 rq_last_ran;
int rq_prio;
- bool rq_running; /* There is a task running */
int soft_affined; /* Running or queued tasks with this set as their rq */
u64 load_update; /* When we last updated load */
unsigned long load_avg; /* Rolling load average */
@@ -43,6 +42,7 @@ struct rq {
struct root_domain *rd;
struct sched_domain *sd;
int *cpu_locality; /* CPU relative cache distance */
+ struct rq **rq_order; /* RQs ordered by relative cache distance */
#ifdef CONFIG_SCHED_SMT
cpumask_t thread_mask;
bool (*siblings_idle)(struct rq *rq);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 38eacc323..44817c640 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -74,6 +74,7 @@
#include <linux/context_tracking.h>
#include <linux/compiler.h>
#include <linux/frame.h>
+#include <linux/prefetch.h>
#include <asm/switch_to.h>
#include <asm/tlb.h>
@@ -1937,7 +1938,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
* chain to provide order. Instead we do:
*
* 1) smp_store_release(X->on_cpu, 0)
- * 2) smp_cond_acquire(!X->on_cpu)
+ * 2) smp_cond_load_acquire(&X->on_cpu, !VAL)
*
* Example:
*
@@ -1948,7 +1949,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
* sched-out X
* smp_store_release(X->on_cpu, 0);
*
- * smp_cond_acquire(!X->on_cpu);
+ * smp_cond_load_acquire(&X->on_cpu, !VAL);
* X->state = WAKING
* set_task_cpu(X,2)
*
@@ -1974,7 +1975,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
* This means that any means of doing remote wakeups must order the CPU doing
* the wakeup against the CPU the task is going to end up running on. This,
* however, is already required for the regular Program-Order guarantee above,
- * since the waking CPU is the one issueing the ACQUIRE (smp_cond_acquire).
+ * since the waking CPU is the one issuing the ACQUIRE (smp_cond_load_acquire).
*
*/
@@ -2069,7 +2070,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* This ensures that tasks getting woken will be fully ordered against
* their previous state and preserve Program Order.
*/
- smp_cond_acquire(!p->on_cpu);
+ smp_cond_load_acquire(&p->on_cpu, !VAL);
p->sched_contributes_to_load = !!task_contributes_to_load(p);
p->state = TASK_WAKING;
@@ -2364,11 +2365,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
__sched_fork(clone_flags, p);
/*
- * We mark the process as running here. This guarantees that
+ * We mark the process as NEW here. This guarantees that
* nobody will actually run it, and a signal or other external
* event cannot wake it up and insert it on the runqueue either.
*/
- p->state = TASK_RUNNING;
+ p->state = TASK_NEW;
/*
* Make sure we do not leak PI boosting priority to the child.
@@ -2405,8 +2406,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
p->sched_class = &fair_sched_class;
}
- if (p->sched_class->task_fork)
- p->sched_class->task_fork(p);
+ init_entity_runnable_average(&p->se);
/*
* The child is not yet in the pid-hash so no cgroup attach races,
@@ -2416,7 +2416,13 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
* Silence PROVE_RCU.
*/
raw_spin_lock_irqsave(&p->pi_lock, flags);
- set_task_cpu(p, cpu);
+ /*
+ * We're setting the cpu for the first time, we don't migrate,
+ * so use __set_task_cpu().
+ */
+ __set_task_cpu(p, cpu);
+ if (p->sched_class->task_fork)
+ p->sched_class->task_fork(p);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
#ifdef CONFIG_SCHED_INFO
@@ -2548,16 +2554,18 @@ void wake_up_new_task(struct task_struct *p)
struct rq_flags rf;
struct rq *rq;
- /* Initialize new task's runnable average */
- init_entity_runnable_average(&p->se);
raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
+ p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
/*
* Fork balancing, do it here and not earlier because:
* - cpus_allowed can change in the fork path
* - any previously selected cpu might disappear through hotplug
+ *
+ * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
+ * as we're not fully set-up yet.
*/
- set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
+ __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
#endif
rq = __task_rq_lock(p, &rf);
post_init_entity_util_avg(&p->se);
@@ -2987,6 +2995,23 @@ EXPORT_PER_CPU_SYMBOL(kstat);
EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
/*
+ * The function fair_sched_class.update_curr accesses the struct curr
+ * and its field curr->exec_start; when called from task_sched_runtime(),
+ * we observe a high rate of cache misses in practice.
+ * Prefetching this data results in improved performance.
+ */
+static inline void prefetch_curr_exec_start(struct task_struct *p)
+{
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ struct sched_entity *curr = (&p->se)->cfs_rq->curr;
+#else
+ struct sched_entity *curr = (&task_rq(p)->cfs)->curr;
+#endif
+ prefetch(curr);
+ prefetch(&curr->exec_start);
+}
+
+/*
* Return accounted runtime for the task.
* In case the task is currently running, return the runtime plus current's
* pending runtime that have not been accounted yet.
@@ -3020,6 +3045,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
* thread, breaking clock_gettime().
*/
if (task_current(rq, p) && task_on_rq_queued(p)) {
+ prefetch_curr_exec_start(p);
update_rq_clock(rq);
p->sched_class->update_curr(rq);
}
@@ -3183,6 +3209,9 @@ static noinline void __schedule_bug(struct task_struct *prev)
pr_cont("\n");
}
#endif
+ if (panic_on_warn)
+ panic("scheduling while atomic\n");
+
dump_stack();
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
@@ -4774,7 +4803,8 @@ out_unlock:
* @len: length in bytes of the bitmask pointed to by user_mask_ptr
* @user_mask_ptr: user-space pointer to hold the current cpu mask
*
- * Return: 0 on success. An error code otherwise.
+ * Return: size of CPU mask copied to user_mask_ptr on success. An
+ * error code otherwise.
*/
SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
unsigned long __user *, user_mask_ptr)
@@ -7255,7 +7285,6 @@ static void sched_rq_cpu_starting(unsigned int cpu)
struct rq *rq = cpu_rq(cpu);
rq->calc_load_update = calc_load_update;
- account_reset_rq(rq);
update_max_interval();
}
@@ -7735,6 +7764,8 @@ void sched_online_group(struct task_group *tg, struct task_group *parent)
INIT_LIST_HEAD(&tg->children);
list_add_rcu(&tg->siblings, &parent->children);
spin_unlock_irqrestore(&task_group_lock, flags);
+
+ online_fair_sched_group(tg);
}
/* rcu callback to free various structures associated with a task group */
@@ -7763,27 +7794,9 @@ void sched_offline_group(struct task_group *tg)
spin_unlock_irqrestore(&task_group_lock, flags);
}
-/* change task's runqueue when it moves between groups.
- * The caller of this function should have put the task in its new group
- * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
- * reflect its new group.
- */
-void sched_move_task(struct task_struct *tsk)
+static void sched_change_group(struct task_struct *tsk, int type)
{
struct task_group *tg;
- int queued, running;
- struct rq_flags rf;
- struct rq *rq;
-
- rq = task_rq_lock(tsk, &rf);
-
- running = task_current(rq, tsk);
- queued = task_on_rq_queued(tsk);
-
- if (queued)
- dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
- if (unlikely(running))
- put_prev_task(rq, tsk);
/*
* All callers are synchronized by task_rq_lock(); we do not use RCU
@@ -7796,11 +7809,37 @@ void sched_move_task(struct task_struct *tsk)
tsk->sched_task_group = tg;
#ifdef CONFIG_FAIR_GROUP_SCHED
- if (tsk->sched_class->task_move_group)
- tsk->sched_class->task_move_group(tsk);
+ if (tsk->sched_class->task_change_group)
+ tsk->sched_class->task_change_group(tsk, type);
else
#endif
set_task_rq(tsk, task_cpu(tsk));
+}
+
+/*
+ * Change task's runqueue when it moves between groups.
+ *
+ * The caller of this function should have put the task in its new group by
+ * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect
+ * its new group.
+ */
+void sched_move_task(struct task_struct *tsk)
+{
+ int queued, running;
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(tsk, &rf);
+
+ running = task_current(rq, tsk);
+ queued = task_on_rq_queued(tsk);
+
+ if (queued)
+ dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
+ if (unlikely(running))
+ put_prev_task(rq, tsk);
+
+ sched_change_group(tsk, TASK_MOVE_GROUP);
if (unlikely(running))
tsk->sched_class->set_curr_task(rq);
@@ -8228,15 +8267,27 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
sched_free_group(tg);
}
+/*
+ * This is called before wake_up_new_task(), therefore we really only
+ * have to set its group bits, all the other stuff does not apply.
+ */
static void cpu_cgroup_fork(struct task_struct *task)
{
- sched_move_task(task);
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(task, &rf);
+
+ sched_change_group(task, TASK_SET_GROUP);
+
+ task_rq_unlock(rq, task, &rf);
}
static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
{
struct task_struct *task;
struct cgroup_subsys_state *css;
+ int ret = 0;
cgroup_taskset_for_each(task, css, tset) {
#ifdef CONFIG_RT_GROUP_SCHED
@@ -8247,8 +8298,24 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
if (task->sched_class != &fair_sched_class)
return -EINVAL;
#endif
+ /*
+ * Serialize against wake_up_new_task() such that if it's
+ * running, we're sure to observe its full state.
+ */
+ raw_spin_lock_irq(&task->pi_lock);
+ /*
+ * Avoid calling sched_move_task() before wake_up_new_task()
+ * has happened. This would lead to problems with PELT, due to
+ * move wanting to detach+attach while we're not attached yet.
+ */
+ if (task->state == TASK_NEW)
+ ret = -EINVAL;
+ raw_spin_unlock_irq(&task->pi_lock);
+
+ if (ret)
+ break;
}
- return 0;
+ return ret;
}
static void cpu_cgroup_attach(struct cgroup_taskset *tset)
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 41f85c4d0..bc0b309c3 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -25,15 +25,13 @@ enum cpuacct_stat_index {
CPUACCT_STAT_NSTATS,
};
-enum cpuacct_usage_index {
- CPUACCT_USAGE_USER, /* ... user mode */
- CPUACCT_USAGE_SYSTEM, /* ... kernel mode */
-
- CPUACCT_USAGE_NRUSAGE,
+static const char * const cpuacct_stat_desc[] = {
+ [CPUACCT_STAT_USER] = "user",
+ [CPUACCT_STAT_SYSTEM] = "system",
};
struct cpuacct_usage {
- u64 usages[CPUACCT_USAGE_NRUSAGE];
+ u64 usages[CPUACCT_STAT_NSTATS];
};
/* track cpu usage of a group of tasks and its child groups */
@@ -108,16 +106,16 @@ static void cpuacct_css_free(struct cgroup_subsys_state *css)
}
static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
- enum cpuacct_usage_index index)
+ enum cpuacct_stat_index index)
{
struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
u64 data;
/*
- * We allow index == CPUACCT_USAGE_NRUSAGE here to read
+ * We allow index == CPUACCT_STAT_NSTATS here to read
* the sum of suages.
*/
- BUG_ON(index > CPUACCT_USAGE_NRUSAGE);
+ BUG_ON(index > CPUACCT_STAT_NSTATS);
#ifndef CONFIG_64BIT
/*
@@ -126,11 +124,11 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
raw_spin_lock_irq(&cpu_rq(cpu)->lock);
#endif
- if (index == CPUACCT_USAGE_NRUSAGE) {
+ if (index == CPUACCT_STAT_NSTATS) {
int i = 0;
data = 0;
- for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++)
+ for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
data += cpuusage->usages[i];
} else {
data = cpuusage->usages[index];
@@ -155,7 +153,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
raw_spin_lock_irq(&cpu_rq(cpu)->lock);
#endif
- for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++)
+ for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
cpuusage->usages[i] = val;
#ifndef CONFIG_64BIT
@@ -165,7 +163,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
/* return total cpu usage (in nanoseconds) of a group */
static u64 __cpuusage_read(struct cgroup_subsys_state *css,
- enum cpuacct_usage_index index)
+ enum cpuacct_stat_index index)
{
struct cpuacct *ca = css_ca(css);
u64 totalcpuusage = 0;
@@ -180,18 +178,18 @@ static u64 __cpuusage_read(struct cgroup_subsys_state *css,
static u64 cpuusage_user_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
- return __cpuusage_read(css, CPUACCT_USAGE_USER);
+ return __cpuusage_read(css, CPUACCT_STAT_USER);
}
static u64 cpuusage_sys_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
- return __cpuusage_read(css, CPUACCT_USAGE_SYSTEM);
+ return __cpuusage_read(css, CPUACCT_STAT_SYSTEM);
}
static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
- return __cpuusage_read(css, CPUACCT_USAGE_NRUSAGE);
+ return __cpuusage_read(css, CPUACCT_STAT_NSTATS);
}
static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
@@ -213,7 +211,7 @@ static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
}
static int __cpuacct_percpu_seq_show(struct seq_file *m,
- enum cpuacct_usage_index index)
+ enum cpuacct_stat_index index)
{
struct cpuacct *ca = css_ca(seq_css(m));
u64 percpu;
@@ -229,48 +227,78 @@ static int __cpuacct_percpu_seq_show(struct seq_file *m,
static int cpuacct_percpu_user_seq_show(struct seq_file *m, void *V)
{
- return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_USER);
+ return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_USER);
}
static int cpuacct_percpu_sys_seq_show(struct seq_file *m, void *V)
{
- return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_SYSTEM);
+ return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_SYSTEM);
}
static int cpuacct_percpu_seq_show(struct seq_file *m, void *V)
{
- return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_NRUSAGE);
+ return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_NSTATS);
}
-static const char * const cpuacct_stat_desc[] = {
- [CPUACCT_STAT_USER] = "user",
- [CPUACCT_STAT_SYSTEM] = "system",
-};
+static int cpuacct_all_seq_show(struct seq_file *m, void *V)
+{
+ struct cpuacct *ca = css_ca(seq_css(m));
+ int index;
+ int cpu;
+
+ seq_puts(m, "cpu");
+ for (index = 0; index < CPUACCT_STAT_NSTATS; index++)
+ seq_printf(m, " %s", cpuacct_stat_desc[index]);
+ seq_puts(m, "\n");
+
+ for_each_possible_cpu(cpu) {
+ struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+
+ seq_printf(m, "%d", cpu);
+
+ for (index = 0; index < CPUACCT_STAT_NSTATS; index++) {
+#ifndef CONFIG_64BIT
+ /*
+ * Take rq->lock to make 64-bit read safe on 32-bit
+ * platforms.
+ */
+ raw_spin_lock_irq(&cpu_rq(cpu)->lock);
+#endif
+
+ seq_printf(m, " %llu", cpuusage->usages[index]);
+
+#ifndef CONFIG_64BIT
+ raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
+#endif
+ }
+ seq_puts(m, "\n");
+ }
+ return 0;
+}
static int cpuacct_stats_show(struct seq_file *sf, void *v)
{
struct cpuacct *ca = css_ca(seq_css(sf));
+ s64 val[CPUACCT_STAT_NSTATS];
int cpu;
- s64 val = 0;
+ int stat;
+ memset(val, 0, sizeof(val));
for_each_possible_cpu(cpu) {
- struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
- val += kcpustat->cpustat[CPUTIME_USER];
- val += kcpustat->cpustat[CPUTIME_NICE];
- }
- val = cputime64_to_clock_t(val);
- seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val);
+ u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat;
- val = 0;
- for_each_possible_cpu(cpu) {
- struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
- val += kcpustat->cpustat[CPUTIME_SYSTEM];
- val += kcpustat->cpustat[CPUTIME_IRQ];
- val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
+ val[CPUACCT_STAT_USER] += cpustat[CPUTIME_USER];
+ val[CPUACCT_STAT_USER] += cpustat[CPUTIME_NICE];
+ val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM];
+ val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ];
+ val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ];
}
- val = cputime64_to_clock_t(val);
- seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
+ for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) {
+ seq_printf(sf, "%s %lld\n",
+ cpuacct_stat_desc[stat],
+ cputime64_to_clock_t(val[stat]));
+ }
return 0;
}
@@ -302,6 +330,10 @@ static struct cftype files[] = {
.seq_show = cpuacct_percpu_sys_seq_show,
},
{
+ .name = "usage_all",
+ .seq_show = cpuacct_all_seq_show,
+ },
+ {
.name = "stat",
.seq_show = cpuacct_stats_show,
},
@@ -316,11 +348,11 @@ static struct cftype files[] = {
void cpuacct_charge(struct task_struct *tsk, u64 cputime)
{
struct cpuacct *ca;
- int index = CPUACCT_USAGE_SYSTEM;
+ int index = CPUACCT_STAT_SYSTEM;
struct pt_regs *regs = task_pt_regs(tsk);
if (regs && user_mode(regs))
- index = CPUACCT_USAGE_USER;
+ index = CPUACCT_STAT_USER;
rcu_read_lock();
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 5be588204..d4184498c 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -168,7 +168,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
if (old_idx == IDX_INVALID) {
cp->size++;
- cp->elements[cp->size - 1].dl = 0;
+ cp->elements[cp->size - 1].dl = dl;
cp->elements[cp->size - 1].cpu = cpu;
cp->elements[cpu].idx = cp->size - 1;
cpudl_change_key(cp, cp->size - 1, dl);
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 3d3ab8205..eba226d7c 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -51,6 +51,8 @@ struct sugov_cpu {
struct update_util_data update_util;
struct sugov_policy *sg_policy;
+ unsigned int cached_raw_freq;
+
/* The fields below are only needed when sharing a policy. */
unsigned long util;
unsigned long max;
@@ -110,7 +112,7 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
/**
* get_next_freq - Compute a new frequency for a given cpufreq policy.
- * @policy: cpufreq policy object to compute the new frequency for.
+ * @sg_cpu: schedutil cpu object to compute the new frequency for.
* @util: Current CPU utilization.
* @max: CPU capacity.
*
@@ -125,14 +127,25 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
* next_freq = C * curr_freq * util_raw / max
*
* Take C = 1.25 for the frequency tipping point at (util / max) = 0.8.
+ *
+ * The lowest driver-supported frequency which is equal to or greater than the
+ * raw next_freq (as calculated above) is returned, subject to policy min/max
+ * and cpufreq driver limitations.
*/
-static unsigned int get_next_freq(struct cpufreq_policy *policy,
- unsigned long util, unsigned long max)
+static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util,
+ unsigned long max)
{
+ struct sugov_policy *sg_policy = sg_cpu->sg_policy;
+ struct cpufreq_policy *policy = sg_policy->policy;
unsigned int freq = arch_scale_freq_invariant() ?
policy->cpuinfo.max_freq : policy->cur;
- return (freq + (freq >> 2)) * util / max;
+ freq = (freq + (freq >> 2)) * util / max;
+
+ if (freq == sg_cpu->cached_raw_freq && sg_policy->next_freq != UINT_MAX)
+ return sg_policy->next_freq;
+ sg_cpu->cached_raw_freq = freq;
+ return cpufreq_driver_resolve_freq(policy, freq);
}
static void sugov_update_single(struct update_util_data *hook, u64 time,
@@ -147,13 +160,14 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
return;
next_f = util == ULONG_MAX ? policy->cpuinfo.max_freq :
- get_next_freq(policy, util, max);
+ get_next_freq(sg_cpu, util, max);
sugov_update_commit(sg_policy, time, next_f);
}
-static unsigned int sugov_next_freq_shared(struct sugov_policy *sg_policy,
+static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu,
unsigned long util, unsigned long max)
{
+ struct sugov_policy *sg_policy = sg_cpu->sg_policy;
struct cpufreq_policy *policy = sg_policy->policy;
unsigned int max_f = policy->cpuinfo.max_freq;
u64 last_freq_update_time = sg_policy->last_freq_update_time;
@@ -193,7 +207,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_policy *sg_policy,
}
}
- return get_next_freq(policy, util, max);
+ return get_next_freq(sg_cpu, util, max);
}
static void sugov_update_shared(struct update_util_data *hook, u64 time,
@@ -210,7 +224,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time,
sg_cpu->last_update = time;
if (sugov_should_update_freq(sg_policy, time)) {
- next_f = sugov_next_freq_shared(sg_policy, util, max);
+ next_f = sugov_next_freq_shared(sg_cpu, util, max);
sugov_update_commit(sg_policy, time, next_f);
}
@@ -398,7 +412,7 @@ static int sugov_init(struct cpufreq_policy *policy)
return ret;
}
-static int sugov_exit(struct cpufreq_policy *policy)
+static void sugov_exit(struct cpufreq_policy *policy)
{
struct sugov_policy *sg_policy = policy->governor_data;
struct sugov_tunables *tunables = sg_policy->tunables;
@@ -416,7 +430,6 @@ static int sugov_exit(struct cpufreq_policy *policy)
mutex_unlock(&global_tunables_lock);
sugov_policy_free(sg_policy);
- return 0;
}
static int sugov_start(struct cpufreq_policy *policy)
@@ -438,6 +451,7 @@ static int sugov_start(struct cpufreq_policy *policy)
sg_cpu->util = ULONG_MAX;
sg_cpu->max = 0;
sg_cpu->last_update = 0;
+ sg_cpu->cached_raw_freq = 0;
cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
sugov_update_shared);
} else {
@@ -448,7 +462,7 @@ static int sugov_start(struct cpufreq_policy *policy)
return 0;
}
-static int sugov_stop(struct cpufreq_policy *policy)
+static void sugov_stop(struct cpufreq_policy *policy)
{
struct sugov_policy *sg_policy = policy->governor_data;
unsigned int cpu;
@@ -460,53 +474,29 @@ static int sugov_stop(struct cpufreq_policy *policy)
irq_work_sync(&sg_policy->irq_work);
cancel_work_sync(&sg_policy->work);
- return 0;
}
-static int sugov_limits(struct cpufreq_policy *policy)
+static void sugov_limits(struct cpufreq_policy *policy)
{
struct sugov_policy *sg_policy = policy->governor_data;
if (!policy->fast_switch_enabled) {
mutex_lock(&sg_policy->work_lock);
-
- if (policy->max < policy->cur)
- __cpufreq_driver_target(policy, policy->max,
- CPUFREQ_RELATION_H);
- else if (policy->min > policy->cur)
- __cpufreq_driver_target(policy, policy->min,
- CPUFREQ_RELATION_L);
-
+ cpufreq_policy_apply_limits(policy);
mutex_unlock(&sg_policy->work_lock);
}
sg_policy->need_freq_update = true;
- return 0;
-}
-
-int sugov_governor(struct cpufreq_policy *policy, unsigned int event)
-{
- if (event == CPUFREQ_GOV_POLICY_INIT) {
- return sugov_init(policy);
- } else if (policy->governor_data) {
- switch (event) {
- case CPUFREQ_GOV_POLICY_EXIT:
- return sugov_exit(policy);
- case CPUFREQ_GOV_START:
- return sugov_start(policy);
- case CPUFREQ_GOV_STOP:
- return sugov_stop(policy);
- case CPUFREQ_GOV_LIMITS:
- return sugov_limits(policy);
- }
- }
- return -EINVAL;
}
static struct cpufreq_governor schedutil_gov = {
.name = "schedutil",
- .governor = sugov_governor,
.owner = THIS_MODULE,
+ .init = sugov_init,
+ .exit = sugov_exit,
+ .start = sugov_start,
+ .stop = sugov_stop,
+ .limits = sugov_limits,
};
static int __init sugov_module_init(void)
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index a24cfb41d..a846cf89e 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -49,15 +49,12 @@ DEFINE_PER_CPU(seqcount_t, irq_time_seq);
*/
void irqtime_account_irq(struct task_struct *curr)
{
- unsigned long flags;
s64 delta;
int cpu;
if (!sched_clock_irqtime)
return;
- local_irq_save(flags);
-
cpu = smp_processor_id();
delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
__this_cpu_add(irq_start_time, delta);
@@ -75,44 +72,53 @@ void irqtime_account_irq(struct task_struct *curr)
__this_cpu_add(cpu_softirq_time, delta);
irq_time_write_end();
- local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(irqtime_account_irq);
-static int irqtime_account_hi_update(void)
+static cputime_t irqtime_account_hi_update(cputime_t maxtime)
{
u64 *cpustat = kcpustat_this_cpu->cpustat;
unsigned long flags;
- u64 latest_ns;
- int ret = 0;
+ cputime_t irq_cputime;
local_irq_save(flags);
- latest_ns = this_cpu_read(cpu_hardirq_time);
- if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
- ret = 1;
+ irq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time)) -
+ cpustat[CPUTIME_IRQ];
+ irq_cputime = min(irq_cputime, maxtime);
+ cpustat[CPUTIME_IRQ] += irq_cputime;
local_irq_restore(flags);
- return ret;
+ return irq_cputime;
}
-static int irqtime_account_si_update(void)
+static cputime_t irqtime_account_si_update(cputime_t maxtime)
{
u64 *cpustat = kcpustat_this_cpu->cpustat;
unsigned long flags;
- u64 latest_ns;
- int ret = 0;
+ cputime_t softirq_cputime;
local_irq_save(flags);
- latest_ns = this_cpu_read(cpu_softirq_time);
- if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
- ret = 1;
+ softirq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time)) -
+ cpustat[CPUTIME_SOFTIRQ];
+ softirq_cputime = min(softirq_cputime, maxtime);
+ cpustat[CPUTIME_SOFTIRQ] += softirq_cputime;
local_irq_restore(flags);
- return ret;
+ return softirq_cputime;
}
#else /* CONFIG_IRQ_TIME_ACCOUNTING */
#define sched_clock_irqtime (0)
+static cputime_t irqtime_account_hi_update(cputime_t dummy)
+{
+ return 0;
+}
+
+static cputime_t irqtime_account_si_update(cputime_t dummy)
+{
+ return 0;
+}
+
#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
static inline void task_group_account_field(struct task_struct *p, int index,
@@ -257,29 +263,47 @@ void account_idle_time(cputime_t cputime)
cpustat[CPUTIME_IDLE] += (__force u64) cputime;
}
-static __always_inline bool steal_account_process_tick(void)
+/*
+ * When a guest is interrupted for a longer amount of time, missed clock
+ * ticks are not redelivered later. Due to that, this function may on
+ * occasion account more time than the calling functions think elapsed.
+ */
+static __always_inline cputime_t steal_account_process_time(cputime_t maxtime)
{
#ifdef CONFIG_PARAVIRT
if (static_key_false(&paravirt_steal_enabled)) {
+ cputime_t steal_cputime;
u64 steal;
- unsigned long steal_jiffies;
steal = paravirt_steal_clock(smp_processor_id());
steal -= this_rq()->prev_steal_time;
- /*
- * steal is in nsecs but our caller is expecting steal
- * time in jiffies. Lets cast the result to jiffies
- * granularity and account the rest on the next rounds.
- */
- steal_jiffies = nsecs_to_jiffies(steal);
- this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies);
+ steal_cputime = min(nsecs_to_cputime(steal), maxtime);
+ account_steal_time(steal_cputime);
+ this_rq()->prev_steal_time += cputime_to_nsecs(steal_cputime);
- account_steal_time(jiffies_to_cputime(steal_jiffies));
- return steal_jiffies;
+ return steal_cputime;
}
#endif
- return false;
+ return 0;
+}
+
+/*
+ * Account how much elapsed time was spent in steal, irq, or softirq time.
+ */
+static inline cputime_t account_other_time(cputime_t max)
+{
+ cputime_t accounted;
+
+ accounted = steal_account_process_time(max);
+
+ if (accounted < max)
+ accounted += irqtime_account_hi_update(max - accounted);
+
+ if (accounted < max)
+ accounted += irqtime_account_si_update(max - accounted);
+
+ return accounted;
}
/*
@@ -342,21 +366,23 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
struct rq *rq, int ticks)
{
- cputime_t scaled = cputime_to_scaled(cputime_one_jiffy);
- u64 cputime = (__force u64) cputime_one_jiffy;
- u64 *cpustat = kcpustat_this_cpu->cpustat;
+ u64 cputime = (__force u64) cputime_one_jiffy * ticks;
+ cputime_t scaled, other;
- if (steal_account_process_tick())
+ /*
+ * When returning from idle, many ticks can get accounted at
+ * once, including some ticks of steal, irq, and softirq time.
+ * Subtract those ticks from the amount of time accounted to
+ * idle, or potentially user or system time. Due to rounding,
+ * other time can exceed ticks occasionally.
+ */
+ other = account_other_time(ULONG_MAX);
+ if (other >= cputime)
return;
+ cputime -= other;
+ scaled = cputime_to_scaled(cputime);
- cputime *= ticks;
- scaled *= ticks;
-
- if (irqtime_account_hi_update()) {
- cpustat[CPUTIME_IRQ] += cputime;
- } else if (irqtime_account_si_update()) {
- cpustat[CPUTIME_SOFTIRQ] += cputime;
- } else if (this_cpu_ksoftirqd() == p) {
+ if (this_cpu_ksoftirqd() == p) {
/*
* ksoftirqd time do not get accounted in cpu_softirq_time.
* So, we have to handle it separately here.
@@ -406,6 +432,10 @@ void vtime_common_task_switch(struct task_struct *prev)
}
#endif
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
+
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
/*
* Archs that account the whole time spent in the idle task
* (outside irq) as idle time can rely on this and just implement
@@ -415,33 +445,16 @@ void vtime_common_task_switch(struct task_struct *prev)
* vtime_account().
*/
#ifndef __ARCH_HAS_VTIME_ACCOUNT
-void vtime_common_account_irq_enter(struct task_struct *tsk)
+void vtime_account_irq_enter(struct task_struct *tsk)
{
- if (!in_interrupt()) {
- /*
- * If we interrupted user, context_tracking_in_user()
- * is 1 because the context tracking don't hook
- * on irq entry/exit. This way we know if
- * we need to flush user time on kernel entry.
- */
- if (context_tracking_in_user()) {
- vtime_account_user(tsk);
- return;
- }
-
- if (is_idle_task(tsk)) {
- vtime_account_idle(tsk);
- return;
- }
- }
- vtime_account_system(tsk);
+ if (!in_interrupt() && is_idle_task(tsk))
+ vtime_account_idle(tsk);
+ else
+ vtime_account_system(tsk);
}
-EXPORT_SYMBOL_GPL(vtime_common_account_irq_enter);
+EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
#endif /* __ARCH_HAS_VTIME_ACCOUNT */
-#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
-
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
*ut = p->utime;
@@ -466,7 +479,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
*/
void account_process_tick(struct task_struct *p, int user_tick)
{
- cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+ cputime_t cputime, scaled, steal;
struct rq *rq = this_rq();
if (vtime_accounting_cpu_enabled())
@@ -477,26 +490,21 @@ void account_process_tick(struct task_struct *p, int user_tick)
return;
}
- if (steal_account_process_tick())
+ cputime = cputime_one_jiffy;
+ steal = steal_account_process_time(ULONG_MAX);
+
+ if (steal >= cputime)
return;
+ cputime -= steal;
+ scaled = cputime_to_scaled(cputime);
+
if (user_tick)
- account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
+ account_user_time(p, cputime, scaled);
else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
- account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
- one_jiffy_scaled);
+ account_system_time(p, HARDIRQ_OFFSET, cputime, scaled);
else
- account_idle_time(cputime_one_jiffy);
-}
-
-/*
- * Account multiple ticks of steal time.
- * @p: the process from which the cpu time has been stolen
- * @ticks: number of stolen ticks
- */
-void account_steal_ticks(unsigned long ticks)
-{
- account_steal_time(jiffies_to_cputime(ticks));
+ account_idle_time(cputime);
}
/*
@@ -505,13 +513,21 @@ void account_steal_ticks(unsigned long ticks)
*/
void account_idle_ticks(unsigned long ticks)
{
+ cputime_t cputime, steal;
if (sched_clock_irqtime) {
irqtime_account_idle_ticks(ticks);
return;
}
- account_idle_time(jiffies_to_cputime(ticks));
+ cputime = jiffies_to_cputime(ticks);
+ steal = steal_account_process_time(ULONG_MAX);
+
+ if (steal >= cputime)
+ return;
+
+ cputime -= steal;
+ account_idle_time(cputime);
}
/*
@@ -686,12 +702,21 @@ static cputime_t vtime_delta(struct task_struct *tsk)
static cputime_t get_vtime_delta(struct task_struct *tsk)
{
unsigned long now = READ_ONCE(jiffies);
- unsigned long delta = now - tsk->vtime_snap;
+ cputime_t delta, other;
+ /*
+ * Unlike tick based timing, vtime based timing never has lost
+ * ticks, and has no need for steal time accounting to make up for
+ * lost ticks. Vtime accounts a rounded version of actual
+ * elapsed time. Limit account_other_time to prevent rounding
+ * errors from causing elapsed vtime to go negative.
+ */
+ delta = jiffies_to_cputime(now - tsk->vtime_snap);
+ other = account_other_time(delta);
WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
tsk->vtime_snap = now;
- return jiffies_to_cputime(delta);
+ return delta - other;
}
static void __vtime_account_system(struct task_struct *tsk)
@@ -711,16 +736,6 @@ void vtime_account_system(struct task_struct *tsk)
write_seqcount_end(&tsk->vtime_seqcount);
}
-void vtime_gen_account_irq_exit(struct task_struct *tsk)
-{
- write_seqcount_begin(&tsk->vtime_seqcount);
- if (vtime_delta(tsk))
- __vtime_account_system(tsk);
- if (context_tracking_in_user())
- tsk->vtime_snap_whence = VTIME_USER;
- write_seqcount_end(&tsk->vtime_seqcount);
-}
-
void vtime_account_user(struct task_struct *tsk)
{
cputime_t delta_cpu;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index fcb7f0217..1ce886728 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -658,8 +658,11 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
*
* XXX figure out if select_task_rq_dl() deals with offline cpus.
*/
- if (unlikely(!rq->online))
+ if (unlikely(!rq->online)) {
+ lockdep_unpin_lock(&rq->lock, rf.cookie);
rq = dl_task_offline_migration(rq, p);
+ rf.cookie = lockdep_pin_lock(&rq->lock);
+ }
/*
* Queueing this task back might have overloaded rq, check if we need
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 0368c393a..2a0a99952 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -879,9 +879,9 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
nr_switches = p->nvcsw + p->nivcsw;
-#ifdef CONFIG_SCHEDSTATS
P(se.nr_migrations);
+#ifdef CONFIG_SCHEDSTATS
if (schedstat_enabled()) {
u64 avg_atom, avg_per_cpu;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b5743d5b0..4309c8e76 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -715,6 +715,11 @@ void init_entity_runnable_average(struct sched_entity *se)
/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
}
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
+static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq);
+static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force);
+static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se);
+
/*
* With new tasks being created, their initial util_avgs are extrapolated
* based on the cfs_rq's current util_avg:
@@ -745,6 +750,8 @@ void post_init_entity_util_avg(struct sched_entity *se)
struct cfs_rq *cfs_rq = cfs_rq_of(se);
struct sched_avg *sa = &se->avg;
long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
+ u64 now = cfs_rq_clock_task(cfs_rq);
+ int tg_update;
if (cap > 0) {
if (cfs_rq->avg.util_avg != 0) {
@@ -758,16 +765,42 @@ void post_init_entity_util_avg(struct sched_entity *se)
}
sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
}
+
+ if (entity_is_task(se)) {
+ struct task_struct *p = task_of(se);
+ if (p->sched_class != &fair_sched_class) {
+ /*
+ * For !fair tasks do:
+ *
+ update_cfs_rq_load_avg(now, cfs_rq, false);
+ attach_entity_load_avg(cfs_rq, se);
+ switched_from_fair(rq, p);
+ *
+ * such that the next switched_to_fair() has the
+ * expected state.
+ */
+ se->avg.last_update_time = now;
+ return;
+ }
+ }
+
+ tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
+ attach_entity_load_avg(cfs_rq, se);
+ if (tg_update)
+ update_tg_load_avg(cfs_rq, false);
}
-#else
+#else /* !CONFIG_SMP */
void init_entity_runnable_average(struct sched_entity *se)
{
}
void post_init_entity_util_avg(struct sched_entity *se)
{
}
-#endif
+static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
+{
+}
+#endif /* CONFIG_SMP */
/*
* Update the current task's runtime statistics.
@@ -1328,6 +1361,8 @@ static void task_numa_assign(struct task_numa_env *env,
{
if (env->best_task)
put_task_struct(env->best_task);
+ if (p)
+ get_task_struct(p);
env->best_task = p;
env->best_imp = imp;
@@ -1395,31 +1430,11 @@ static void task_numa_compare(struct task_numa_env *env,
long imp = env->p->numa_group ? groupimp : taskimp;
long moveimp = imp;
int dist = env->dist;
- bool assigned = false;
rcu_read_lock();
-
- raw_spin_lock_irq(&dst_rq->lock);
- cur = dst_rq->curr;
- /*
- * No need to move the exiting task or idle task.
- */
- if ((cur->flags & PF_EXITING) || is_idle_task(cur))
+ cur = task_rcu_dereference(&dst_rq->curr);
+ if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
cur = NULL;
- else {
- /*
- * The task_struct must be protected here to protect the
- * p->numa_faults access in the task_weight since the
- * numa_faults could already be freed in the following path:
- * finish_task_switch()
- * --> put_task_struct()
- * --> __put_task_struct()
- * --> task_numa_free()
- */
- get_task_struct(cur);
- }
-
- raw_spin_unlock_irq(&dst_rq->lock);
/*
* Because we have preemption enabled we can get migrated around and
@@ -1502,7 +1517,6 @@ balance:
*/
if (!load_too_imbalanced(src_load, dst_load, env)) {
imp = moveimp - 1;
- put_task_struct(cur);
cur = NULL;
goto assign;
}
@@ -1528,16 +1542,9 @@ balance:
env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
assign:
- assigned = true;
task_numa_assign(env, cur, imp);
unlock:
rcu_read_unlock();
- /*
- * The dst_rq->curr isn't assigned. The protection for task_struct is
- * finished.
- */
- if (cur && !assigned)
- put_task_struct(cur);
}
static void task_numa_find_cpu(struct task_numa_env *env,
@@ -2891,8 +2898,6 @@ void set_task_rq_fair(struct sched_entity *se,
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
#endif /* CONFIG_FAIR_GROUP_SCHED */
-static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
-
static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
@@ -2939,7 +2944,23 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
WRITE_ONCE(*ptr, res); \
} while (0)
-/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
+/**
+ * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
+ * @now: current time, as per cfs_rq_clock_task()
+ * @cfs_rq: cfs_rq to update
+ * @update_freq: should we call cfs_rq_util_change() or will the caller do so
+ *
+ * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
+ * avg. The immediate corollary is that all (fair) tasks must be attached, see
+ * post_init_entity_util_avg().
+ *
+ * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
+ *
+ * Returns true if the load decayed or we removed utilization. It is expected
+ * that one calls update_tg_load_avg() on this condition, but after you've
+ * modified the cfs_rq avg (attach/detach), such that we propagate the new
+ * avg up.
+ */
static inline int
update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
{
@@ -2994,6 +3015,14 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
update_tg_load_avg(cfs_rq, 0);
}
+/**
+ * attach_entity_load_avg - attach this entity to its cfs_rq load avg
+ * @cfs_rq: cfs_rq to attach to
+ * @se: sched_entity to attach
+ *
+ * Must call update_cfs_rq_load_avg() before this, since we rely on
+ * cfs_rq->avg.last_update_time being current.
+ */
static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
if (!sched_feat(ATTACH_AGE_LOAD))
@@ -3002,6 +3031,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
/*
* If we got migrated (either between CPUs or between cgroups) we'll
* have aged the average right before clearing @last_update_time.
+ *
+ * Or we're fresh through post_init_entity_util_avg().
*/
if (se->avg.last_update_time) {
__update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
@@ -3023,6 +3054,14 @@ skip_aging:
cfs_rq_util_change(cfs_rq);
}
+/**
+ * detach_entity_load_avg - detach this entity from its cfs_rq load avg
+ * @cfs_rq: cfs_rq to detach from
+ * @se: sched_entity to detach
+ *
+ * Must call update_cfs_rq_load_avg() before this, since we rely on
+ * cfs_rq->avg.last_update_time being current.
+ */
static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
__update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
@@ -3107,11 +3146,14 @@ void remove_entity_load_avg(struct sched_entity *se)
u64 last_update_time;
/*
- * Newly created task or never used group entity should not be removed
- * from its (source) cfs_rq
+ * tasks cannot exit without having gone through wake_up_new_task() ->
+ * post_init_entity_util_avg() which will have added things to the
+ * cfs_rq, so we can remove unconditionally.
+ *
+ * Similarly for groups, they will have passed through
+ * post_init_entity_util_avg() before unregister_sched_fair_group()
+ * calls this.
*/
- if (se->avg.last_update_time == 0)
- return;
last_update_time = cfs_rq_last_update_time(cfs_rq);
@@ -3134,6 +3176,12 @@ static int idle_balance(struct rq *this_rq);
#else /* CONFIG_SMP */
+static inline int
+update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
+{
+ return 0; /* !CONFIG_SMP stub: always 0, so a caller's "if (tg_update)" branch is never taken */
+}
+
static inline void update_load_avg(struct sched_entity *se, int not_used)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
@@ -3723,7 +3771,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
{
if (unlikely(cfs_rq->throttle_count))
- return cfs_rq->throttled_clock_task;
+ return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
}
@@ -3861,13 +3909,11 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
cfs_rq->throttle_count--;
-#ifdef CONFIG_SMP
if (!cfs_rq->throttle_count) {
/* adjust cfs_rq_clock_task() */
cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
cfs_rq->throttled_clock_task;
}
-#endif
return 0;
}
@@ -4220,26 +4266,6 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
if (!cfs_bandwidth_used())
return;
- /* Synchronize hierarchical throttle counter: */
- if (unlikely(!cfs_rq->throttle_uptodate)) {
- struct rq *rq = rq_of(cfs_rq);
- struct cfs_rq *pcfs_rq;
- struct task_group *tg;
-
- cfs_rq->throttle_uptodate = 1;
-
- /* Get closest up-to-date node, because leaves go first: */
- for (tg = cfs_rq->tg->parent; tg; tg = tg->parent) {
- pcfs_rq = tg->cfs_rq[cpu_of(rq)];
- if (pcfs_rq->throttle_uptodate)
- break;
- }
- if (tg) {
- cfs_rq->throttle_count = pcfs_rq->throttle_count;
- cfs_rq->throttled_clock_task = rq_clock_task(rq);
- }
- }
-
/* an active group must be handled by the update_curr()->put() path */
if (!cfs_rq->runtime_enabled || cfs_rq->curr)
return;
@@ -4254,6 +4280,23 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
throttle_cfs_rq(cfs_rq);
}
+static void sync_throttle(struct task_group *tg, int cpu)
+{
+ struct cfs_rq *pcfs_rq, *cfs_rq;
+ /* Copy the parent's throttle state on @cpu into @tg's cfs_rq; nothing to do if bandwidth control is unused. */
+ if (!cfs_bandwidth_used())
+ return;
+ /* A group without a parent has nothing to inherit from. */
+ if (!tg->parent)
+ return;
+
+ cfs_rq = tg->cfs_rq[cpu];
+ pcfs_rq = tg->parent->cfs_rq[cpu];
+ /* Take the parent's count and record the rq's current task clock, which tg_unthrottle_up() subtracts later. */
+ cfs_rq->throttle_count = pcfs_rq->throttle_count;
+ cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
+}
+
/* conditionally throttle active cfs_rq's from put_prev_entity() */
static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
@@ -4393,6 +4436,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
+static inline void sync_throttle(struct task_group *tg, int cpu) {} /* no-op stub, like the neighbouring empty helpers in this section */
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
@@ -4501,7 +4545,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
*
* note: in the case of encountering a throttled cfs_rq we will
* post the final h_nr_running increment below.
- */
+ */
if (cfs_rq_throttled(cfs_rq))
break;
cfs_rq->h_nr_running++;
@@ -8342,31 +8386,17 @@ static void task_fork_fair(struct task_struct *p)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se, *curr;
- int this_cpu = smp_processor_id();
struct rq *rq = this_rq();
- unsigned long flags;
-
- raw_spin_lock_irqsave(&rq->lock, flags);
+ raw_spin_lock(&rq->lock);
update_rq_clock(rq);
cfs_rq = task_cfs_rq(current);
curr = cfs_rq->curr;
-
- /*
- * Not only the cpu but also the task_group of the parent might have
- * been changed after parent->se.parent,cfs_rq were copied to
- * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
- * of child point to valid ones.
- */
- rcu_read_lock();
- __set_task_cpu(p, this_cpu);
- rcu_read_unlock();
-
- update_curr(cfs_rq);
-
- if (curr)
+ if (curr) {
+ update_curr(cfs_rq);
se->vruntime = curr->vruntime;
+ }
place_entity(cfs_rq, se, 1);
if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
@@ -8379,8 +8409,7 @@ static void task_fork_fair(struct task_struct *p)
}
se->vruntime -= cfs_rq->min_vruntime;
-
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ raw_spin_unlock(&rq->lock);
}
/*
@@ -8436,6 +8465,8 @@ static void detach_task_cfs_rq(struct task_struct *p)
{
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ u64 now = cfs_rq_clock_task(cfs_rq);
+ int tg_update;
if (!vruntime_normalized(p)) {
/*
@@ -8447,13 +8478,18 @@ static void detach_task_cfs_rq(struct task_struct *p)
}
/* Catch up with the cfs_rq and remove our load when we leave */
+ tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
detach_entity_load_avg(cfs_rq, se);
+ if (tg_update)
+ update_tg_load_avg(cfs_rq, false);
}
static void attach_task_cfs_rq(struct task_struct *p)
{
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ u64 now = cfs_rq_clock_task(cfs_rq);
+ int tg_update;
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
@@ -8464,7 +8500,10 @@ static void attach_task_cfs_rq(struct task_struct *p)
#endif
/* Synchronize task with its cfs_rq */
+ tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
attach_entity_load_avg(cfs_rq, se);
+ if (tg_update)
+ update_tg_load_avg(cfs_rq, false);
if (!vruntime_normalized(p))
se->vruntime += cfs_rq->min_vruntime;
@@ -8524,6 +8563,14 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
}
#ifdef CONFIG_FAIR_GROUP_SCHED
+static void task_set_group_fair(struct task_struct *p)
+{
+ struct sched_entity *se = &p->se;
+ /* Call set_task_rq() for @p's current CPU, then set depth to parent depth + 1, or 0 when there is no parent. */
+ set_task_rq(p, task_cpu(p));
+ se->depth = se->parent ? se->parent->depth + 1 : 0;
+}
+
static void task_move_group_fair(struct task_struct *p)
{
detach_task_cfs_rq(p);
@@ -8536,6 +8583,19 @@ static void task_move_group_fair(struct task_struct *p)
attach_task_cfs_rq(p);
}
+static void task_change_group_fair(struct task_struct *p, int type)
+{
+ switch (type) { /* TASK_SET_GROUP or TASK_MOVE_GROUP; any other value does nothing (no default case) */
+ case TASK_SET_GROUP:
+ task_set_group_fair(p);
+ break;
+
+ case TASK_MOVE_GROUP:
+ task_move_group_fair(p);
+ break;
+ }
+}
+
void free_fair_sched_group(struct task_group *tg)
{
int i;
@@ -8587,10 +8647,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
init_cfs_rq(cfs_rq);
init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
init_entity_runnable_average(se);
-
- raw_spin_lock_irq(&rq->lock);
- post_init_entity_util_avg(se);
- raw_spin_unlock_irq(&rq->lock);
}
return 1;
@@ -8601,6 +8657,23 @@ err:
return 0;
}
+void online_fair_sched_group(struct task_group *tg)
+{
+ struct sched_entity *se;
+ struct rq *rq;
+ int i;
+ /* For every possible CPU, run the post-init and throttle-sync steps for @tg's entity on that CPU. */
+ for_each_possible_cpu(i) {
+ rq = cpu_rq(i);
+ se = tg->se[i];
+ /* Both calls below run between raw_spin_lock_irq() and raw_spin_unlock_irq() on this rq's lock. */
+ raw_spin_lock_irq(&rq->lock);
+ post_init_entity_util_avg(se);
+ sync_throttle(tg, i);
+ raw_spin_unlock_irq(&rq->lock);
+ }
+}
+
void unregister_fair_sched_group(struct task_group *tg)
{
unsigned long flags;
@@ -8705,6 +8778,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
return 1;
}
+void online_fair_sched_group(struct task_group *tg) { } /* empty stub; presumably the !CONFIG_FAIR_GROUP_SCHED variant, given the #endif just below */
+
void unregister_fair_sched_group(struct task_group *tg) { }
#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -8764,7 +8839,7 @@ const struct sched_class fair_sched_class = {
.update_curr = update_curr_fair,
#ifdef CONFIG_FAIR_GROUP_SCHED
- .task_move_group = task_move_group_fair,
+ .task_change_group = task_change_group_fair,
#endif
};
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index e362a836c..1e855dcbd 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -205,6 +205,8 @@ exit_idle:
*/
static void cpu_idle_loop(void)
{
+ int cpu = smp_processor_id();
+
while (1) {
/*
* If the arch has a polling bit, we maintain an invariant:
@@ -223,7 +225,7 @@ static void cpu_idle_loop(void)
check_pgt_cache();
rmb();
- if (cpu_is_offline(smp_processor_id())) {
+ if (cpu_is_offline(cpu)) {
cpuhp_report_idle_dead();
arch_cpu_idle_dead();
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 898c0d2f1..c64fc5114 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -321,6 +321,7 @@ extern int tg_nop(struct task_group *tg, void *data);
extern void free_fair_sched_group(struct task_group *tg);
extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
+extern void online_fair_sched_group(struct task_group *tg);
extern void unregister_fair_sched_group(struct task_group *tg);
extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
struct sched_entity *se, int cpu,
@@ -437,7 +438,7 @@ struct cfs_rq {
u64 throttled_clock, throttled_clock_task;
u64 throttled_clock_task_time;
- int throttled, throttle_count, throttle_uptodate;
+ int throttled, throttle_count;
struct list_head throttled_list;
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -1113,7 +1114,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
* In particular, the load of prev->state in finish_task_switch() must
* happen before this.
*
- * Pairs with the smp_cond_acquire() in try_to_wake_up().
+ * Pairs with the smp_cond_load_acquire() in try_to_wake_up().
*/
smp_store_release(&prev->on_cpu, 0);
#endif
@@ -1246,8 +1247,11 @@ struct sched_class {
void (*update_curr) (struct rq *rq);
+#define TASK_SET_GROUP 0
+#define TASK_MOVE_GROUP 1
+
#ifdef CONFIG_FAIR_GROUP_SCHED
- void (*task_move_group) (struct task_struct *p);
+ void (*task_change_group) (struct task_struct *p, int type);
#endif
};
@@ -1809,16 +1813,3 @@ static inline void cpufreq_trigger_update(u64 time) {}
#else /* arch_scale_freq_capacity */
#define arch_scale_freq_invariant() (false)
#endif
-
-static inline void account_reset_rq(struct rq *rq)
-{
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
- rq->prev_irq_time = 0;
-#endif
-#ifdef CONFIG_PARAVIRT
- rq->prev_steal_time = 0;
-#endif
-#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
- rq->prev_steal_time_rq = 0;
-#endif
-}
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 7002796f1..0db7c8a2a 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -173,7 +173,7 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
*
* Returns valid seccomp BPF response codes.
*/
-static u32 seccomp_run_filters(struct seccomp_data *sd)
+static u32 seccomp_run_filters(const struct seccomp_data *sd)
{
struct seccomp_data sd_local;
u32 ret = SECCOMP_RET_ALLOW;
@@ -347,7 +347,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
{
struct seccomp_filter *sfilter;
int ret;
- const bool save_orig = config_enabled(CONFIG_CHECKPOINT_RESTORE);
+ const bool save_orig = IS_ENABLED(CONFIG_CHECKPOINT_RESTORE);
if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
return ERR_PTR(-EINVAL);
@@ -542,7 +542,7 @@ void secure_computing_strict(int this_syscall)
{
int mode = current->seccomp.mode;
- if (config_enabled(CONFIG_CHECKPOINT_RESTORE) &&
+ if (IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) &&
unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
return;
@@ -554,20 +554,10 @@ void secure_computing_strict(int this_syscall)
BUG();
}
#else
-int __secure_computing(void)
-{
- u32 phase1_result = seccomp_phase1(NULL);
-
- if (likely(phase1_result == SECCOMP_PHASE1_OK))
- return 0;
- else if (likely(phase1_result == SECCOMP_PHASE1_SKIP))
- return -1;
- else
- return seccomp_phase2(phase1_result);
-}
#ifdef CONFIG_SECCOMP_FILTER
-static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd)
+static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
+ const bool recheck_after_trace)
{
u32 filter_ret, action;
int data;
@@ -599,10 +589,50 @@ static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd)
goto skip;
case SECCOMP_RET_TRACE:
- return filter_ret; /* Save the rest for phase 2. */
+ /* We've been put in this state by the ptracer already. */
+ if (recheck_after_trace)
+ return 0;
+
+ /* ENOSYS these calls if there is no tracer attached. */
+ if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
+ syscall_set_return_value(current,
+ task_pt_regs(current),
+ -ENOSYS, 0);
+ goto skip;
+ }
+
+ /* Allow the BPF to provide the event message */
+ ptrace_event(PTRACE_EVENT_SECCOMP, data);
+ /*
+ * The delivery of a fatal signal during event
+ * notification may silently skip tracer notification,
+ * which could leave us with a potentially unmodified
+ * syscall that the tracer would have liked to have
+ * changed. Since the process is about to die, we just
+ * force the syscall to be skipped and let the signal
+ * kill the process and correctly handle any tracer exit
+ * notifications.
+ */
+ if (fatal_signal_pending(current))
+ goto skip;
+ /* Check if the tracer forced the syscall to be skipped. */
+ this_syscall = syscall_get_nr(current, task_pt_regs(current));
+ if (this_syscall < 0)
+ goto skip;
+
+ /*
+ * Recheck the syscall, since it may have changed. This
+ * intentionally uses a NULL struct seccomp_data to force
+ * a reload of all registers. This does not goto skip since
+ * a skip would have already been reported.
+ */
+ if (__seccomp_filter(this_syscall, NULL, true))
+ return -1;
+
+ return 0;
case SECCOMP_RET_ALLOW:
- return SECCOMP_PHASE1_OK;
+ return 0;
case SECCOMP_RET_KILL:
default:
@@ -614,96 +644,38 @@ static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd)
skip:
audit_seccomp(this_syscall, 0, action);
- return SECCOMP_PHASE1_SKIP;
+ return -1;
+}
+#else
+static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
+ const bool recheck_after_trace)
+{
+ BUG(); /* #else branch of CONFIG_SECCOMP_FILTER: presumably filter mode can never be active in such a build */
+}
#endif
-/**
- * seccomp_phase1() - run fast path seccomp checks on the current syscall
- * @arg sd: The seccomp_data or NULL
- *
- * This only reads pt_regs via the syscall_xyz helpers. The only change
- * it will make to pt_regs is via syscall_set_return_value, and it will
- * only do that if it returns SECCOMP_PHASE1_SKIP.
- *
- * If sd is provided, it will not read pt_regs at all.
- *
- * It may also call do_exit or force a signal; these actions must be
- * safe.
- *
- * If it returns SECCOMP_PHASE1_OK, the syscall passes checks and should
- * be processed normally.
- *
- * If it returns SECCOMP_PHASE1_SKIP, then the syscall should not be
- * invoked. In this case, seccomp_phase1 will have set the return value
- * using syscall_set_return_value.
- *
- * If it returns anything else, then the return value should be passed
- * to seccomp_phase2 from a context in which ptrace hooks are safe.
- */
-u32 seccomp_phase1(struct seccomp_data *sd)
+int __secure_computing(const struct seccomp_data *sd)
{
int mode = current->seccomp.mode;
- int this_syscall = sd ? sd->nr :
- syscall_get_nr(current, task_pt_regs(current));
+ int this_syscall;
- if (config_enabled(CONFIG_CHECKPOINT_RESTORE) &&
+ if (IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) &&
unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
- return SECCOMP_PHASE1_OK;
+ return 0;
+
+ this_syscall = sd ? sd->nr :
+ syscall_get_nr(current, task_pt_regs(current));
switch (mode) {
case SECCOMP_MODE_STRICT:
__secure_computing_strict(this_syscall); /* may call do_exit */
- return SECCOMP_PHASE1_OK;
-#ifdef CONFIG_SECCOMP_FILTER
+ return 0;
case SECCOMP_MODE_FILTER:
- return __seccomp_phase1_filter(this_syscall, sd);
-#endif
+ return __seccomp_filter(this_syscall, sd, false);
default:
BUG();
}
}
-
-/**
- * seccomp_phase2() - finish slow path seccomp work for the current syscall
- * @phase1_result: The return value from seccomp_phase1()
- *
- * This must be called from a context in which ptrace hooks can be used.
- *
- * Returns 0 if the syscall should be processed or -1 to skip the syscall.
- */
-int seccomp_phase2(u32 phase1_result)
-{
- struct pt_regs *regs = task_pt_regs(current);
- u32 action = phase1_result & SECCOMP_RET_ACTION;
- int data = phase1_result & SECCOMP_RET_DATA;
-
- BUG_ON(action != SECCOMP_RET_TRACE);
-
- audit_seccomp(syscall_get_nr(current, regs), 0, action);
-
- /* Skip these calls if there is no tracer. */
- if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
- syscall_set_return_value(current, regs,
- -ENOSYS, 0);
- return -1;
- }
-
- /* Allow the BPF to provide the event message */
- ptrace_event(PTRACE_EVENT_SECCOMP, data);
- /*
- * The delivery of a fatal signal during event
- * notification may silently skip tracer notification.
- * Terminating the task now avoids executing a system
- * call that may not be intended.
- */
- if (fatal_signal_pending(current))
- do_exit(SIGSYS);
- if (syscall_get_nr(current, regs) < 0)
- return -1; /* Explicit request to skip. */
-
- return 0;
-}
#endif /* CONFIG_HAVE_ARCH_SECCOMP_FILTER */
long prctl_get_seccomp(void)
diff --git a/kernel/signal.c b/kernel/signal.c
index 96e9bc406..af21afc00 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2751,23 +2751,18 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
* @ts: upper bound on process time suspension
*/
int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
- const struct timespec *ts)
+ const struct timespec *ts)
{
+ ktime_t *to = NULL, timeout = { .tv64 = KTIME_MAX };
struct task_struct *tsk = current;
- long timeout = MAX_SCHEDULE_TIMEOUT;
sigset_t mask = *which;
- int sig;
+ int sig, ret = 0;
if (ts) {
if (!timespec_valid(ts))
return -EINVAL;
- timeout = timespec_to_jiffies(ts);
- /*
- * We can be close to the next tick, add another one
- * to ensure we will wait at least the time asked for.
- */
- if (ts->tv_sec || ts->tv_nsec)
- timeout++;
+ timeout = timespec_to_ktime(*ts);
+ to = &timeout;
}
/*
@@ -2778,7 +2773,7 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
spin_lock_irq(&tsk->sighand->siglock);
sig = dequeue_signal(tsk, &mask, info);
- if (!sig && timeout) {
+ if (!sig && timeout.tv64) {
/*
* None ready, temporarily unblock those we're interested
* while we are sleeping in so that we'll be awakened when
@@ -2790,8 +2785,9 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
recalc_sigpending();
spin_unlock_irq(&tsk->sighand->siglock);
- timeout = freezable_schedule_timeout_interruptible(timeout);
-
+ __set_current_state(TASK_INTERRUPTIBLE);
+ ret = freezable_schedule_hrtimeout_range(to, tsk->timer_slack_ns,
+ HRTIMER_MODE_REL);
spin_lock_irq(&tsk->sighand->siglock);
__set_task_blocked(tsk, &tsk->real_blocked);
sigemptyset(&tsk->real_blocked);
@@ -2801,7 +2797,7 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
if (sig)
return sig;
- return timeout ? -EINTR : -EAGAIN;
+ return ret ? -EINTR : -EAGAIN;
}
/**
diff --git a/kernel/smp.c b/kernel/smp.c
index 74165443c..3aa642d39 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -33,69 +33,54 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue);
static void flush_smp_call_function_queue(bool warn_cpu_offline);
-static int
-hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
+int smpcfd_prepare_cpu(unsigned int cpu)
{
- long cpu = (long)hcpu;
struct call_function_data *cfd = &per_cpu(cfd_data, cpu);
- switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
- cpu_to_node(cpu)))
- return notifier_from_errno(-ENOMEM);
- cfd->csd = alloc_percpu(struct call_single_data);
- if (!cfd->csd) {
- free_cpumask_var(cfd->cpumask);
- return notifier_from_errno(-ENOMEM);
- }
- break;
-
-#ifdef CONFIG_HOTPLUG_CPU
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- /* Fall-through to the CPU_DEAD[_FROZEN] case. */
-
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
+ if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
+ cpu_to_node(cpu)))
+ return -ENOMEM;
+ cfd->csd = alloc_percpu(struct call_single_data);
+ if (!cfd->csd) {
free_cpumask_var(cfd->cpumask);
- free_percpu(cfd->csd);
- break;
+ return -ENOMEM;
+ }
- case CPU_DYING:
- case CPU_DYING_FROZEN:
- /*
- * The IPIs for the smp-call-function callbacks queued by other
- * CPUs might arrive late, either due to hardware latencies or
- * because this CPU disabled interrupts (inside stop-machine)
- * before the IPIs were sent. So flush out any pending callbacks
- * explicitly (without waiting for the IPIs to arrive), to
- * ensure that the outgoing CPU doesn't go offline with work
- * still pending.
- */
- flush_smp_call_function_queue(false);
- break;
-#endif
- };
+ return 0;
+}
+
+int smpcfd_dead_cpu(unsigned int cpu)
+{
+ struct call_function_data *cfd = &per_cpu(cfd_data, cpu);
- return NOTIFY_OK;
+ free_cpumask_var(cfd->cpumask);
+ free_percpu(cfd->csd);
+ return 0;
}
-static struct notifier_block hotplug_cfd_notifier = {
- .notifier_call = hotplug_cfd,
-};
+int smpcfd_dying_cpu(unsigned int cpu)
+{
+ /*
+ * The IPIs for the smp-call-function callbacks queued by other
+ * CPUs might arrive late, either due to hardware latencies or
+ * because this CPU disabled interrupts (inside stop-machine)
+ * before the IPIs were sent. So flush out any pending callbacks
+ * explicitly (without waiting for the IPIs to arrive), to
+ * ensure that the outgoing CPU doesn't go offline with work
+ * still pending.
+ */
+ flush_smp_call_function_queue(false); /* false is the warn_cpu_offline argument */
+ return 0; /* always reports success; @cpu itself is not used */
+}
void __init call_function_init(void)
{
- void *cpu = (void *)(long)smp_processor_id();
int i;
for_each_possible_cpu(i)
init_llist_head(&per_cpu(call_single_queue, i));
- hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu);
- register_cpu_notifier(&hotplug_cfd_notifier);
+ smpcfd_prepare_cpu(smp_processor_id());
}
/*
@@ -107,7 +92,7 @@ void __init call_function_init(void)
*/
static __always_inline void csd_lock_wait(struct call_single_data *csd)
{
- smp_cond_acquire(!(csd->flags & CSD_FLAG_LOCK));
+ smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK));
}
static __always_inline void csd_lock(struct call_single_data *csd)
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 0bcf0cfb2..fc0d8270f 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -174,7 +174,7 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
if (tsk)
return 0;
- td = kzalloc_node(sizeof(*td), GFP_KERNEL | ___GFP_TOI_NOTRACK, cpu_to_node(cpu));
+ td = kzalloc_node(sizeof(*td), GFP_KERNEL, cpu_to_node(cpu));
if (!td)
return -ENOMEM;
td->cpu = cpu;
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index a467e6c28..4a1ca5f6d 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -21,6 +21,7 @@
#include <linux/smpboot.h>
#include <linux/atomic.h>
#include <linux/lglock.h>
+#include <linux/nmi.h>
/*
* Structure to determine completion condition and record errors. May
@@ -209,6 +210,13 @@ static int multi_cpu_stop(void *data)
break;
}
ack_state(msdata);
+ } else if (curstate > MULTI_STOP_PREPARE) {
+ /*
+ * At this stage all other CPUs we depend on must spin
+ * in the same loop. Any reason for hard-lockup should
+ * be detected and reported on their side.
+ */
+ touch_nmi_watchdog();
}
} while (curstate != MULTI_STOP_EXIT);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ca4c0640e..ca8093ed7 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -821,6 +821,13 @@ static struct ctl_table kern_table[] = {
.extra2 = &ten_thousand,
},
{
+ .procname = "printk_devkmsg",
+ .data = devkmsg_log_str,
+ .maxlen = DEVKMSG_STR_MAX_SIZE,
+ .mode = 0644,
+ .proc_handler = devkmsg_sysctl_set_loglvl,
+ },
+ {
.procname = "dmesg_restrict",
.data = &dmesg_restrict,
.maxlen = sizeof(int),
@@ -1241,6 +1248,17 @@ static struct ctl_table kern_table[] = {
.extra2 = &one,
},
#endif
+#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU)
+ {
+ .procname = "panic_on_rcu_stall",
+ .data = &sysctl_panic_on_rcu_stall,
+ .maxlen = sizeof(sysctl_panic_on_rcu_stall),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+#endif
{ }
};
@@ -1533,8 +1551,8 @@ static struct ctl_table vm_table[] = {
#ifdef CONFIG_NUMA
{
.procname = "zone_reclaim_mode",
- .data = &zone_reclaim_mode,
- .maxlen = sizeof(zone_reclaim_mode),
+ .data = &node_reclaim_mode,
+ .maxlen = sizeof(node_reclaim_mode),
.mode = 0644,
.proc_handler = proc_dointvec,
.extra1 = &zero,
@@ -2158,6 +2176,21 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
return 0;
}
+/* proc_douintvec() conv hook: writes must fit in [0, UINT_MAX]; reads are never negative. */
+static int do_proc_douintvec_conv(bool *negp, unsigned long *lvalp,
+				  int *valp, int write, void *data)
+{
+	if (write) {
+		if (*negp || *lvalp > UINT_MAX)	/* would wrap when stored in *valp */
+			return -EINVAL;
+		*valp = *lvalp;
+	} else {
+		*negp = false;	/* don't hand the caller an unset sign flag on reads */
+		*lvalp = (unsigned int)*valp;
+	}
+	return 0;
+}
+
static const char proc_wspace_sep[] = { ' ', '\t', '\n' };
static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
@@ -2277,8 +2310,27 @@ static int do_proc_dointvec(struct ctl_table *table, int write,
int proc_dointvec(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
- return do_proc_dointvec(table,write,buffer,lenp,ppos,
- NULL,NULL);
+ return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL);
+}
+
+/**
+ * proc_douintvec - read a vector of unsigned integers
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer
+ * values from/to the user buffer, treated as an ASCII string.
+ *
+ * Returns 0 on success.
+ */
+int proc_douintvec(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return do_proc_dointvec(table, write, buffer, lenp, ppos,
+ do_proc_douintvec_conv, NULL);
}
/*
@@ -2876,6 +2928,12 @@ int proc_dointvec(struct ctl_table *table, int write,
return -ENOSYS;
}
+int proc_douintvec(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS; /* stub variant: same -ENOSYS result as the neighbouring stub just above */
+}
+
int proc_dointvec_minmax(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
@@ -2921,6 +2979,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
* exception granted :-)
*/
EXPORT_SYMBOL(proc_dointvec);
+EXPORT_SYMBOL(proc_douintvec);
EXPORT_SYMBOL(proc_dointvec_jiffies);
EXPORT_SYMBOL(proc_dointvec_minmax);
EXPORT_SYMBOL(proc_dointvec_userhz_jiffies);
diff --git a/kernel/task_work.c b/kernel/task_work.c
index bce3211e7..e056d5429 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -29,7 +29,7 @@ task_work_add(struct task_struct *task, struct callback_head *work, bool notify)
struct callback_head *head;
do {
- head = ACCESS_ONCE(task->task_works);
+ head = READ_ONCE(task->task_works);
if (unlikely(head == &work_exited))
return -ESRCH;
work->next = head;
@@ -57,6 +57,9 @@ task_work_cancel(struct task_struct *task, task_work_func_t func)
struct callback_head **pprev = &task->task_works;
struct callback_head *work;
unsigned long flags;
+
+ if (likely(!task->task_works))
+ return NULL;
/*
* If cmpxchg() fails we continue without updating pprev.
* Either we raced with task_work_add() which added the
@@ -64,8 +67,7 @@ task_work_cancel(struct task_struct *task, task_work_func_t func)
* we raced with task_work_run(), *pprev == NULL/exited.
*/
raw_spin_lock_irqsave(&task->pi_lock, flags);
- while ((work = ACCESS_ONCE(*pprev))) {
- smp_read_barrier_depends();
+ while ((work = lockless_dereference(*pprev))) {
if (work->func != func)
pprev = &work->next;
else if (cmpxchg(pprev, work, work->next) == work)
@@ -95,7 +97,7 @@ void task_work_run(void)
* work_exited unless the list is empty.
*/
do {
- work = ACCESS_ONCE(task->task_works);
+ work = READ_ONCE(task->task_works);
head = !work && (task->flags & PF_EXITING) ?
&work_exited : NULL;
} while (cmpxchg(&task->task_works, work, head) != work);
@@ -108,7 +110,6 @@ void task_work_run(void)
* fail, but it can play with *work and other entries.
*/
raw_spin_unlock_wait(&task->pi_lock);
- smp_mb();
do {
next = work->next;
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index e840ed867..c3aad685b 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -30,7 +30,6 @@
* struct alarm_base - Alarm timer bases
* @lock: Lock for syncrhonized access to the base
* @timerqueue: Timerqueue head managing the list of events
- * @timer: hrtimer used to schedule events while running
* @gettime: Function to read the time correlating to the base
* @base_clockid: clockid for the base
*/
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index a9b76a403..2c5bc77c0 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -645,7 +645,7 @@ void tick_cleanup_dead_cpu(int cpu)
#endif
#ifdef CONFIG_SYSFS
-struct bus_type clockevents_subsys = {
+static struct bus_type clockevents_subsys = {
.name = "clockevents",
.dev_name = "clockevent",
};
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 56ece145a..6a5a310a1 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -669,10 +669,12 @@ static void clocksource_enqueue(struct clocksource *cs)
struct list_head *entry = &clocksource_list;
struct clocksource *tmp;
- list_for_each_entry(tmp, &clocksource_list, list)
+ list_for_each_entry(tmp, &clocksource_list, list) {
/* Keep track of the place, where to insert */
- if (tmp->rating >= cs->rating)
- entry = &tmp->list;
+ if (tmp->rating < cs->rating)
+ break;
+ entry = &tmp->list;
+ }
list_add(&cs->list, entry);
}
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index e99df0ff1..9ba7c820f 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -177,7 +177,7 @@ hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
#endif
}
-#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+#ifdef CONFIG_NO_HZ_COMMON
static inline
struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
int pinned)
@@ -1590,7 +1590,7 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
/*
* Functions related to boot-time initialization:
*/
-static void init_hrtimers_cpu(int cpu)
+int hrtimers_prepare_cpu(unsigned int cpu)
{
struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
int i;
@@ -1602,6 +1602,7 @@ static void init_hrtimers_cpu(int cpu)
cpu_base->cpu = cpu;
hrtimer_init_hres(cpu_base);
+ return 0;
}
#ifdef CONFIG_HOTPLUG_CPU
@@ -1636,7 +1637,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
}
}
-static void migrate_hrtimers(int scpu)
+int hrtimers_dead_cpu(unsigned int scpu)
{
struct hrtimer_cpu_base *old_base, *new_base;
int i;
@@ -1665,45 +1666,14 @@ static void migrate_hrtimers(int scpu)
/* Check, if we got expired work to do */
__hrtimer_peek_ahead_timers();
local_irq_enable();
+ return 0;
}
#endif /* CONFIG_HOTPLUG_CPU */
-static int hrtimer_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
-{
- int scpu = (long)hcpu;
-
- switch (action) {
-
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- init_hrtimers_cpu(scpu);
- break;
-
-#ifdef CONFIG_HOTPLUG_CPU
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- migrate_hrtimers(scpu);
- break;
-#endif
-
- default:
- break;
- }
-
- return NOTIFY_OK;
-}
-
-static struct notifier_block hrtimers_nb = {
- .notifier_call = hrtimer_cpu_notify,
-};
-
void __init hrtimers_init(void)
{
- hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
- (void *)(long)smp_processor_id());
- register_cpu_notifier(&hrtimers_nb);
+ hrtimers_prepare_cpu(smp_processor_id());
}
/**
diff --git a/kernel/time/test_udelay.c b/kernel/time/test_udelay.c
index e622ba365..b0928ab32 100644
--- a/kernel/time/test_udelay.c
+++ b/kernel/time/test_udelay.c
@@ -43,13 +43,13 @@ static int udelay_test_single(struct seq_file *s, int usecs, uint32_t iters)
int allowed_error_ns = usecs * 5;
for (i = 0; i < iters; ++i) {
- struct timespec ts1, ts2;
+ s64 kt1, kt2;
int time_passed;
- ktime_get_ts(&ts1);
+ kt1 = ktime_get_ns();
udelay(usecs);
- ktime_get_ts(&ts2);
- time_passed = timespec_to_ns(&ts2) - timespec_to_ns(&ts1);
+ kt2 = ktime_get_ns();
+ time_passed = kt2 - kt1;
if (i == 0 || time_passed < min)
min = time_passed;
@@ -87,11 +87,11 @@ static int udelay_test_show(struct seq_file *s, void *v)
if (usecs > 0 && iters > 0) {
return udelay_test_single(s, usecs, iters);
} else if (usecs == 0) {
- struct timespec ts;
+ struct timespec64 ts;
- ktime_get_ts(&ts);
- seq_printf(s, "udelay() test (lpj=%ld kt=%ld.%09ld)\n",
- loops_per_jiffy, ts.tv_sec, ts.tv_nsec);
+ ktime_get_ts64(&ts);
+ seq_printf(s, "udelay() test (lpj=%ld kt=%lld.%09ld)\n",
+ loops_per_jiffy, (s64)ts.tv_sec, ts.tv_nsec);
seq_puts(s, "usage:\n");
seq_puts(s, "echo USECS [ITERS] > " DEBUGFS_FILENAME "\n");
seq_puts(s, "cat " DEBUGFS_FILENAME "\n");
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
index 53d7184da..690b797f5 100644
--- a/kernel/time/tick-broadcast-hrtimer.c
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -75,6 +75,7 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc)
}
static struct clock_event_device ce_broadcast_hrtimer = {
+ .name = "bc_hrtimer",
.set_state_shutdown = bc_shutdown,
.set_next_ktime = bc_set_next,
.features = CLOCK_EVT_FEAT_ONESHOT |
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 966a5a6fd..f73825100 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -164,3 +164,4 @@ static inline void timers_update_migration(bool update_nohz) { }
DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem);
+void timer_clear_idle(void);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 536ada80f..2ec7c0022 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -31,7 +31,7 @@
#include <trace/events/timer.h>
/*
- * Per cpu nohz control structure
+ * Per-CPU nohz control structure
*/
static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
@@ -61,7 +61,7 @@ static void tick_do_update_jiffies64(ktime_t now)
if (delta.tv64 < tick_period.tv64)
return;
- /* Reevalute with jiffies_lock held */
+ /* Reevaluate with jiffies_lock held */
write_seqlock(&jiffies_lock);
delta = ktime_sub(now, last_jiffies_update);
@@ -116,8 +116,8 @@ static void tick_sched_do_timer(ktime_t now)
#ifdef CONFIG_NO_HZ_COMMON
/*
* Check if the do_timer duty was dropped. We don't care about
- * concurrency: This happens only when the cpu in charge went
- * into a long sleep. If two cpus happen to assign themself to
+ * concurrency: This happens only when the CPU in charge went
+ * into a long sleep. If two CPUs happen to assign themselves to
* this duty, then the jiffies update is still serialized by
* jiffies_lock.
*/
@@ -349,7 +349,7 @@ void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bi
/*
* Re-evaluate the need for the tick as we switch the current task.
* It might need the tick due to per task/process properties:
- * perf events, posix cpu timers, ...
+ * perf events, posix CPU timers, ...
*/
void __tick_nohz_task_switch(void)
{
@@ -509,8 +509,8 @@ int tick_nohz_tick_stopped(void)
*
* In case the sched_tick was stopped on this CPU, we have to check if jiffies
* must be updated. Otherwise an interrupt handler could use a stale jiffy
- * value. We do this unconditionally on any cpu, as we don't know whether the
- * cpu, which has the update task assigned is in a long sleep.
+ * value. We do this unconditionally on any CPU, as we don't know whether the
+ * CPU, which has the update task assigned is in a long sleep.
*/
static void tick_nohz_update_jiffies(ktime_t now)
{
@@ -526,7 +526,7 @@ static void tick_nohz_update_jiffies(ktime_t now)
}
/*
- * Updates the per cpu time idle statistics counters
+ * Updates the per-CPU time idle statistics counters
*/
static void
update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time)
@@ -566,12 +566,12 @@ static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
}
/**
- * get_cpu_idle_time_us - get the total idle time of a cpu
+ * get_cpu_idle_time_us - get the total idle time of a CPU
* @cpu: CPU number to query
* @last_update_time: variable to store update time in. Do not update
* counters if NULL.
*
- * Return the cummulative idle time (since boot) for a given
+ * Return the cumulative idle time (since boot) for a given
* CPU, in microseconds.
*
* This time is measured via accounting rather than sampling,
@@ -607,12 +607,12 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
/**
- * get_cpu_iowait_time_us - get the total iowait time of a cpu
+ * get_cpu_iowait_time_us - get the total iowait time of a CPU
* @cpu: CPU number to query
* @last_update_time: variable to store update time in. Do not update
* counters if NULL.
*
- * Return the cummulative iowait time (since boot) for a given
+ * Return the cumulative iowait time (since boot) for a given
* CPU, in microseconds.
*
* This time is measured via accounting rather than sampling,
@@ -700,6 +700,12 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
delta = next_tick - basemono;
if (delta <= (u64)TICK_NSEC) {
tick.tv64 = 0;
+
+ /*
+ * Tell the timer code that the base is not idle, i.e. undo
+ * the effect of get_next_timer_interrupt():
+ */
+ timer_clear_idle();
/*
* We've not stopped the tick yet, and there's a timer in the
* next period, so no point in stopping it either, bail.
@@ -726,14 +732,14 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
}
/*
- * If this cpu is the one which updates jiffies, then give up
- * the assignment and let it be taken by the cpu which runs
- * the tick timer next, which might be this cpu as well. If we
+ * If this CPU is the one which updates jiffies, then give up
+ * the assignment and let it be taken by the CPU which runs
+ * the tick timer next, which might be this CPU as well. If we
* don't drop this here the jiffies might be stale and
* do_timer() never invoked. Keep track of the fact that it
- * was the one which had the do_timer() duty last. If this cpu
+ * was the one which had the do_timer() duty last. If this CPU
* is the one which had the do_timer() duty last, we limit the
- * sleep time to the timekeeping max_deferement value.
+ * sleep time to the timekeeping max_deferment value.
* Otherwise we can sleep as long as we want.
*/
delta = timekeeping_max_deferment();
@@ -809,6 +815,12 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
tick_do_update_jiffies64(now);
cpu_load_update_nohz_stop();
+ /*
+ * Clear the timer idle flag, so we avoid IPIs on remote queueing and
+ * the clock forward checks in the enqueue path:
+ */
+ timer_clear_idle();
+
calc_load_exit_idle();
touch_softlockup_watchdog_sched();
/*
@@ -841,9 +853,9 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts)
static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
{
/*
- * If this cpu is offline and it is the one which updates
+ * If this CPU is offline and it is the one which updates
* jiffies, then give up the assignment and let it be taken by
- * the cpu which runs the tick timer next. If we don't drop
+ * the CPU which runs the tick timer next. If we don't drop
* this here the jiffies might be stale and do_timer() never
* invoked.
*/
@@ -933,11 +945,11 @@ void tick_nohz_idle_enter(void)
WARN_ON_ONCE(irqs_disabled());
/*
- * Update the idle state in the scheduler domain hierarchy
- * when tick_nohz_stop_sched_tick() is called from the idle loop.
- * State will be updated to busy during the first busy tick after
- * exiting idle.
- */
+ * Update the idle state in the scheduler domain hierarchy
+ * when tick_nohz_stop_sched_tick() is called from the idle loop.
+ * State will be updated to busy during the first busy tick after
+ * exiting idle.
+ */
set_cpu_sd_state_idle();
local_irq_disable();
@@ -1092,35 +1104,6 @@ static void tick_nohz_switch_to_nohz(void)
tick_nohz_activate(ts, NOHZ_MODE_LOWRES);
}
-/*
- * When NOHZ is enabled and the tick is stopped, we need to kick the
- * tick timer from irq_enter() so that the jiffies update is kept
- * alive during long running softirqs. That's ugly as hell, but
- * correctness is key even if we need to fix the offending softirq in
- * the first place.
- *
- * Note, this is different to tick_nohz_restart. We just kick the
- * timer and do not touch the other magic bits which need to be done
- * when idle is left.
- */
-static void tick_nohz_kick_tick(struct tick_sched *ts, ktime_t now)
-{
-#if 0
- /* Switch back to 2.6.27 behaviour */
- ktime_t delta;
-
- /*
- * Do not touch the tick device, when the next expiry is either
- * already reached or less/equal than the tick period.
- */
- delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now);
- if (delta.tv64 <= tick_period.tv64)
- return;
-
- tick_nohz_restart(ts, now);
-#endif
-}
-
static inline void tick_nohz_irq_enter(void)
{
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
@@ -1131,10 +1114,8 @@ static inline void tick_nohz_irq_enter(void)
now = ktime_get();
if (ts->idle_active)
tick_nohz_stop_idle(ts, now);
- if (ts->tick_stopped) {
+ if (ts->tick_stopped)
tick_nohz_update_jiffies(now);
- tick_nohz_kick_tick(ts, now);
- }
}
#else
@@ -1211,7 +1192,7 @@ void tick_setup_sched_timer(void)
hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
ts->sched_timer.function = tick_sched_timer;
- /* Get the next period (per cpu) */
+ /* Get the next period (per-CPU) */
hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
/* Offset the tick to avert jiffies_lock contention. */
diff --git a/kernel/time/timeconv.c b/kernel/time/timeconv.c
index 86628e755..7142580ad 100644
--- a/kernel/time/timeconv.c
+++ b/kernel/time/timeconv.c
@@ -67,20 +67,21 @@ static const unsigned short __mon_yday[2][13] = {
#define SECS_PER_DAY (SECS_PER_HOUR * 24)
/**
- * time_to_tm - converts the calendar time to local broken-down time
+ * time64_to_tm - converts the calendar time to local broken-down time
*
* @totalsecs the number of seconds elapsed since 00:00:00 on January 1, 1970,
* Coordinated Universal Time (UTC).
* @offset offset seconds adding to totalsecs.
* @result pointer to struct tm variable to receive broken-down time
*/
-void time_to_tm(time_t totalsecs, int offset, struct tm *result)
+void time64_to_tm(time64_t totalsecs, int offset, struct tm *result)
{
long days, rem, y;
+ int remainder;
const unsigned short *ip;
- days = totalsecs / SECS_PER_DAY;
- rem = totalsecs % SECS_PER_DAY;
+ days = div_s64_rem(totalsecs, SECS_PER_DAY, &remainder);
+ rem = remainder;
rem += offset;
while (rem < 0) {
rem += SECS_PER_DAY;
@@ -124,4 +125,4 @@ void time_to_tm(time_t totalsecs, int offset, struct tm *result)
result->tm_mon = y;
result->tm_mday = days + 1;
}
-EXPORT_SYMBOL(time_to_tm);
+EXPORT_SYMBOL(time64_to_tm);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index b6c394563..37dec7e3d 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -403,8 +403,11 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
tkr = tkf->base + (seq & 0x01);
now = ktime_to_ns(tkr->base);
- now += clocksource_delta(tkr->read(tkr->clock),
- tkr->cycle_last, tkr->mask);
+ now += timekeeping_delta_to_ns(tkr,
+ clocksource_delta(
+ tkr->read(tkr->clock),
+ tkr->cycle_last,
+ tkr->mask));
} while (read_seqcount_retry(&tkf->seq, seq));
return now;
@@ -483,10 +486,12 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk)
* users are removed, this can be killed.
*/
remainder = tk->tkr_mono.xtime_nsec & ((1ULL << tk->tkr_mono.shift) - 1);
- tk->tkr_mono.xtime_nsec -= remainder;
- tk->tkr_mono.xtime_nsec += 1ULL << tk->tkr_mono.shift;
- tk->ntp_error += remainder << tk->ntp_error_shift;
- tk->ntp_error -= (1ULL << tk->tkr_mono.shift) << tk->ntp_error_shift;
+ if (remainder != 0) {
+ tk->tkr_mono.xtime_nsec -= remainder;
+ tk->tkr_mono.xtime_nsec += 1ULL << tk->tkr_mono.shift;
+ tk->ntp_error += remainder << tk->ntp_error_shift;
+ tk->ntp_error -= (1ULL << tk->tkr_mono.shift) << tk->ntp_error_shift;
+ }
}
#else
#define old_vsyscall_fixup(tk)
@@ -2189,6 +2194,7 @@ struct timespec64 get_monotonic_coarse64(void)
return now;
}
+EXPORT_SYMBOL(get_monotonic_coarse64);
/*
* Must hold jiffies_lock
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 3a95f9728..32bf6f75a 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -59,43 +59,153 @@ __visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
EXPORT_SYMBOL(jiffies_64);
/*
- * per-CPU timer vector definitions:
+ * The timer wheel has LVL_DEPTH array levels. Each level provides an array of
+ * LVL_SIZE buckets. Each level is driven by its own clock and therefore each
+ * level has a different granularity.
+ *
+ * The level granularity is: LVL_CLK_DIV ^ lvl
+ * The level clock frequency is: HZ / (LVL_CLK_DIV ^ level)
+ *
+ * The array level of a newly armed timer depends on the relative expiry
+ * time. The farther the expiry time is away the higher the array level and
+ * therefore the coarser the granularity becomes.
+ *
+ * Contrary to the original timer wheel implementation, which aims for 'exact'
+ * expiry of the timers, this implementation removes the need for recascading
+ * the timers into the lower array levels. The previous 'classic' timer wheel
+ * implementation of the kernel already violated the 'exact' expiry by adding
+ * slack to the expiry time to provide batched expiration. The granularity
+ * levels provide implicit batching.
+ *
+ * This is an optimization of the original timer wheel implementation for the
+ * majority of the timer wheel use cases: timeouts. The vast majority of
+ * timeout timers (networking, disk I/O ...) are canceled before expiry. If
+ * the timeout expires it indicates that normal operation is disturbed, so it
+ * does not matter much whether the timeout comes with a slight delay.
+ *
+ * The only exception to this are networking timers with a small expiry
+ * time. They rely on the granularity. Those fit into the first wheel level,
+ * which has HZ granularity.
+ *
+ * We don't have cascading anymore. Timers with an expiry time above the
+ * capacity of the last wheel level are force expired at the maximum timeout
+ * value of the last wheel level. From data sampling we know that the maximum
+ * value observed is 5 days (network connection tracking), so this should not
+ * be an issue.
+ *
+ * The currently chosen array constants values are a good compromise between
+ * array size and granularity.
+ *
+ * This results in the following granularity and range levels:
+ *
+ * HZ 1000 steps
+ * Level Offset Granularity Range
+ * 0 0 1 ms 0 ms - 63 ms
+ * 1 64 8 ms 64 ms - 511 ms
+ * 2 128 64 ms 512 ms - 4095 ms (512ms - ~4s)
+ * 3 192 512 ms 4096 ms - 32767 ms (~4s - ~32s)
+ * 4 256 4096 ms (~4s) 32768 ms - 262143 ms (~32s - ~4m)
+ * 5 320 32768 ms (~32s) 262144 ms - 2097151 ms (~4m - ~34m)
+ * 6 384 262144 ms (~4m) 2097152 ms - 16777215 ms (~34m - ~4h)
+ * 7 448 2097152 ms (~34m) 16777216 ms - 134217727 ms (~4h - ~1d)
+ * 8 512 16777216 ms (~4h) 134217728 ms - 1073741822 ms (~1d - ~12d)
+ *
+ * HZ 300
+ * Level Offset Granularity Range
+ * 0 0 3 ms 0 ms - 210 ms
+ * 1 64 26 ms 213 ms - 1703 ms (213ms - ~1s)
+ * 2 128 213 ms 1706 ms - 13650 ms (~1s - ~13s)
+ * 3 192 1706 ms (~1s) 13653 ms - 109223 ms (~13s - ~1m)
+ * 4 256 13653 ms (~13s) 109226 ms - 873810 ms (~1m - ~14m)
+ * 5 320 109226 ms (~1m) 873813 ms - 6990503 ms (~14m - ~1h)
+ * 6 384 873813 ms (~14m) 6990506 ms - 55924050 ms (~1h - ~15h)
+ * 7 448 6990506 ms (~1h) 55924053 ms - 447392423 ms (~15h - ~5d)
+ * 8 512 55924053 ms (~15h) 447392426 ms - 3579139406 ms (~5d - ~41d)
+ *
+ * HZ 250
+ * Level Offset Granularity Range
+ * 0 0 4 ms 0 ms - 255 ms
+ * 1 64 32 ms 256 ms - 2047 ms (256ms - ~2s)
+ * 2 128 256 ms 2048 ms - 16383 ms (~2s - ~16s)
+ * 3 192 2048 ms (~2s) 16384 ms - 131071 ms (~16s - ~2m)
+ * 4 256 16384 ms (~16s) 131072 ms - 1048575 ms (~2m - ~17m)
+ * 5 320 131072 ms (~2m) 1048576 ms - 8388607 ms (~17m - ~2h)
+ * 6 384 1048576 ms (~17m) 8388608 ms - 67108863 ms (~2h - ~18h)
+ * 7 448 8388608 ms (~2h) 67108864 ms - 536870911 ms (~18h - ~6d)
+ * 8 512 67108864 ms (~18h) 536870912 ms - 4294967288 ms (~6d - ~49d)
+ *
+ * HZ 100
+ * Level Offset Granularity Range
+ * 0 0 10 ms 0 ms - 630 ms
+ * 1 64 80 ms 640 ms - 5110 ms (640ms - ~5s)
+ * 2 128 640 ms 5120 ms - 40950 ms (~5s - ~40s)
+ * 3 192 5120 ms (~5s) 40960 ms - 327670 ms (~40s - ~5m)
+ * 4 256 40960 ms (~40s) 327680 ms - 2621430 ms (~5m - ~43m)
+ * 5 320 327680 ms (~5m) 2621440 ms - 20971510 ms (~43m - ~5h)
+ * 6 384 2621440 ms (~43m) 20971520 ms - 167772150 ms (~5h - ~1d)
+ * 7 448 20971520 ms (~5h) 167772160 ms - 1342177270 ms (~1d - ~15d)
*/
-#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6)
-#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8)
-#define TVN_SIZE (1 << TVN_BITS)
-#define TVR_SIZE (1 << TVR_BITS)
-#define TVN_MASK (TVN_SIZE - 1)
-#define TVR_MASK (TVR_SIZE - 1)
-#define MAX_TVAL ((unsigned long)((1ULL << (TVR_BITS + 4*TVN_BITS)) - 1))
-
-struct tvec {
- struct hlist_head vec[TVN_SIZE];
-};
-struct tvec_root {
- struct hlist_head vec[TVR_SIZE];
-};
+/* Clock divisor for the next level */
+#define LVL_CLK_SHIFT 3
+#define LVL_CLK_DIV (1UL << LVL_CLK_SHIFT)
+#define LVL_CLK_MASK (LVL_CLK_DIV - 1)
+#define LVL_SHIFT(n) ((n) * LVL_CLK_SHIFT)
+#define LVL_GRAN(n) (1UL << LVL_SHIFT(n))
-struct tvec_base {
- spinlock_t lock;
- struct timer_list *running_timer;
- unsigned long timer_jiffies;
- unsigned long next_timer;
- unsigned long active_timers;
- unsigned long all_timers;
- int cpu;
- bool migration_enabled;
- bool nohz_active;
- struct tvec_root tv1;
- struct tvec tv2;
- struct tvec tv3;
- struct tvec tv4;
- struct tvec tv5;
-} ____cacheline_aligned;
+/*
+ * The time start value for each level to select the bucket at enqueue
+ * time.
+ */
+#define LVL_START(n) ((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT))
+
+/* Size of each clock level */
+#define LVL_BITS 6
+#define LVL_SIZE (1UL << LVL_BITS)
+#define LVL_MASK (LVL_SIZE - 1)
+#define LVL_OFFS(n) ((n) * LVL_SIZE)
+
+/* Level depth */
+#if HZ > 100
+# define LVL_DEPTH 9
+# else
+# define LVL_DEPTH 8
+#endif
+
+/* The cutoff (max. capacity of the wheel) */
+#define WHEEL_TIMEOUT_CUTOFF (LVL_START(LVL_DEPTH))
+#define WHEEL_TIMEOUT_MAX (WHEEL_TIMEOUT_CUTOFF - LVL_GRAN(LVL_DEPTH - 1))
+
+/*
+ * The resulting wheel size. If NOHZ is configured we allocate two
+ * wheels so we have a separate storage for the deferrable timers.
+ */
+#define WHEEL_SIZE (LVL_SIZE * LVL_DEPTH)
+
+#ifdef CONFIG_NO_HZ_COMMON
+# define NR_BASES 2
+# define BASE_STD 0
+# define BASE_DEF 1
+#else
+# define NR_BASES 1
+# define BASE_STD 0
+# define BASE_DEF 0
+#endif
+/*
+ * One timer wheel instance; NR_BASES of them exist per CPU.
+ *
+ * @lock: lock taken by lock_timer_base() before the base or a timer
+ * tied to it is modified
+ * @running_timer: compared against in __mod_timer() to avoid moving a timer
+ * to another base while it is running
+ * @clk: clock of the base in jiffies; reference point for the bucket
+ * selection in calc_wheel_index()
+ * @next_expiry: expiry of the first expiring timer as tracked by
+ * trigger_dyntick_cpu()
+ * @cpu: CPU number handed to wake_up_nohz_cpu()
+ * @migration_enabled: written by timers_update_migration(); consulted when the
+ * target base of a timer is selected
+ * @nohz_active: set by timers_update_migration(); when set, deferrable
+ * timers use the BASE_DEF base
+ * @is_idle: the base is idle; gates the remote wakeup and the clock
+ * forwarding (NOTE(review): set/cleared outside of the
+ * code visible here - confirm)
+ * @pending_map: one bit per bucket, set by enqueue_timer() and cleared
+ * when the only timer of a bucket is detached
+ * @vectors: the WHEEL_SIZE buckets of the wheel
+ */
+struct timer_base {
+ spinlock_t lock;
+ struct timer_list *running_timer;
+ unsigned long clk;
+ unsigned long next_expiry;
+ unsigned int cpu;
+ bool migration_enabled;
+ bool nohz_active;
+ bool is_idle;
+ DECLARE_BITMAP(pending_map, WHEEL_SIZE);
+ struct hlist_head vectors[WHEEL_SIZE];
+} ____cacheline_aligned;
-static DEFINE_PER_CPU(struct tvec_base, tvec_bases);
+static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]);
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
unsigned int sysctl_timer_migration = 1;
@@ -106,15 +216,17 @@ void timers_update_migration(bool update_nohz)
unsigned int cpu;
/* Avoid the loop, if nothing to update */
- if (this_cpu_read(tvec_bases.migration_enabled) == on)
+ if (this_cpu_read(timer_bases[BASE_STD].migration_enabled) == on)
return;
for_each_possible_cpu(cpu) {
- per_cpu(tvec_bases.migration_enabled, cpu) = on;
+ per_cpu(timer_bases[BASE_STD].migration_enabled, cpu) = on;
+ per_cpu(timer_bases[BASE_DEF].migration_enabled, cpu) = on;
per_cpu(hrtimer_bases.migration_enabled, cpu) = on;
if (!update_nohz)
continue;
- per_cpu(tvec_bases.nohz_active, cpu) = true;
+ per_cpu(timer_bases[BASE_STD].nohz_active, cpu) = true;
+ per_cpu(timer_bases[BASE_DEF].nohz_active, cpu) = true;
per_cpu(hrtimer_bases.nohz_active, cpu) = true;
}
}
@@ -133,20 +245,6 @@ int timer_migration_handler(struct ctl_table *table, int write,
mutex_unlock(&mutex);
return ret;
}
-
-static inline struct tvec_base *get_target_base(struct tvec_base *base,
- int pinned)
-{
- if (pinned || !base->migration_enabled)
- return this_cpu_ptr(&tvec_bases);
- return per_cpu_ptr(&tvec_bases, get_nohz_timer_target());
-}
-#else
-static inline struct tvec_base *get_target_base(struct tvec_base *base,
- int pinned)
-{
- return this_cpu_ptr(&tvec_bases);
-}
#endif
static unsigned long round_jiffies_common(unsigned long j, int cpu,
@@ -351,101 +449,126 @@ unsigned long round_jiffies_up_relative(unsigned long j)
}
EXPORT_SYMBOL_GPL(round_jiffies_up_relative);
-/**
- * set_timer_slack - set the allowed slack for a timer
- * @timer: the timer to be modified
- * @slack_hz: the amount of time (in jiffies) allowed for rounding
- *
- * Set the amount of time, in jiffies, that a certain timer has
- * in terms of slack. By setting this value, the timer subsystem
- * will schedule the actual timer somewhere between
- * the time mod_timer() asks for, and that time plus the slack.
- *
- * By setting the slack to -1, a percentage of the delay is used
- * instead.
- */
-void set_timer_slack(struct timer_list *timer, int slack_hz)
+
+static inline unsigned int timer_get_idx(struct timer_list *timer)
{
- timer->slack = slack_hz;
+ return (timer->flags & TIMER_ARRAYMASK) >> TIMER_ARRAYSHIFT;
}
-EXPORT_SYMBOL_GPL(set_timer_slack);
-static void
-__internal_add_timer(struct tvec_base *base, struct timer_list *timer)
+static inline void timer_set_idx(struct timer_list *timer, unsigned int idx)
{
- unsigned long expires = timer->expires;
- unsigned long idx = expires - base->timer_jiffies;
- struct hlist_head *vec;
+ timer->flags = (timer->flags & ~TIMER_ARRAYMASK) |
+ idx << TIMER_ARRAYSHIFT;
+}
- if (idx < TVR_SIZE) {
- int i = expires & TVR_MASK;
- vec = base->tv1.vec + i;
- } else if (idx < 1 << (TVR_BITS + TVN_BITS)) {
- int i = (expires >> TVR_BITS) & TVN_MASK;
- vec = base->tv2.vec + i;
- } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) {
- int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK;
- vec = base->tv3.vec + i;
- } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) {
- int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK;
- vec = base->tv4.vec + i;
- } else if ((signed long) idx < 0) {
- /*
- * Can happen if you add a timer with expires == jiffies,
- * or you set a timer to go off in the past
- */
- vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK);
+/*
+ * Helper function to calculate the array index for a given expiry
+ * time.
+ *
+ * @expires: expiry time of the timer
+ * @lvl: wheel level the timer is queued in
+ *
+ * One granule of the level is added before the expiry time is shifted down
+ * to the resolution of the level, so the result is the bucket following the
+ * one which contains @expires. The bucket number is wrapped with LVL_MASK
+ * and offset by LVL_OFFS(lvl) to yield an index into the whole wheel.
+ *
+ * NOTE(review): @expires is 'unsigned' while the caller passes an
+ * 'unsigned long'; presumably the truncation is harmless because only the
+ * low bits survive the mask - confirm.
+ */
+static inline unsigned calc_index(unsigned expires, unsigned lvl)
+{
+ expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl);
+ return LVL_OFFS(lvl) + (expires & LVL_MASK);
+}
+
+/*
+ * Select the wheel bucket for a timer.
+ *
+ * @expires: absolute expiry time of the timer in jiffies
+ * @clk: current clock of the timer base in jiffies
+ *
+ * The level is chosen from the distance (expires - clk); the bucket inside
+ * the level is derived from the absolute expiry time by calc_index(). A
+ * timer which is already expired is put into the level 0 bucket of the
+ * current base clock.
+ *
+ * Returns the index into the vectors array of the base.
+ */
+static int calc_wheel_index(unsigned long expires, unsigned long clk)
+{
+ unsigned long delta = expires - clk;
+ unsigned int idx;
+
+ if (delta < LVL_START(1)) {
+ idx = calc_index(expires, 0);
+ } else if (delta < LVL_START(2)) {
+ idx = calc_index(expires, 1);
+ } else if (delta < LVL_START(3)) {
+ idx = calc_index(expires, 2);
+ } else if (delta < LVL_START(4)) {
+ idx = calc_index(expires, 3);
+ } else if (delta < LVL_START(5)) {
+ idx = calc_index(expires, 4);
+ } else if (delta < LVL_START(6)) {
+ idx = calc_index(expires, 5);
+ } else if (delta < LVL_START(7)) {
+ idx = calc_index(expires, 6);
+ } else if (LVL_DEPTH > 8 && delta < LVL_START(8)) {
+ idx = calc_index(expires, 7);
+ } else if ((long) delta < 0) {
+ idx = clk & LVL_MASK;
} else {
- int i;
- /* If the timeout is larger than MAX_TVAL (on 64-bit
- * architectures or with CONFIG_BASE_SMALL=1) then we
- * use the maximum timeout.
+ /*
+ * Force expire obscenely large timeouts to expire at the
+ * capacity limit of the wheel. The cutoff is a distance from
+ * the base clock, so it is checked against delta and the
+ * clamped expiry is made absolute again by adding clk.
*/
- if (idx > MAX_TVAL) {
- idx = MAX_TVAL;
- expires = idx + base->timer_jiffies;
- }
- i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
- vec = base->tv5.vec + i;
+ if (delta >= WHEEL_TIMEOUT_CUTOFF)
+ expires = clk + WHEEL_TIMEOUT_MAX;
+
+ idx = calc_index(expires, LVL_DEPTH - 1);
}
+ return idx;
+}
+
+/*
+ * Enqueue the timer into the hash bucket, mark it pending in
+ * the bitmap and store the index in the timer flags.
+ *
+ * @base: timer base whose wheel receives the timer
+ * @timer: timer to enqueue
+ * @idx: bucket index into base->vectors, e.g. from calc_wheel_index()
+ *
+ * NOTE(review): presumably the caller holds base->lock - confirm.
+ */
+static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
+ unsigned int idx)
+{
+ hlist_add_head(&timer->entry, base->vectors + idx);
+ __set_bit(idx, base->pending_map);
+ timer_set_idx(timer, idx);
+}
+
+static void
+__internal_add_timer(struct timer_base *base, struct timer_list *timer)
+{
+ unsigned int idx;
- hlist_add_head(&timer->entry, vec);
+ idx = calc_wheel_index(timer->expires, base->clk);
+ enqueue_timer(base, timer, idx);
}
-static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
+static void
+trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
{
- /* Advance base->jiffies, if the base is empty */
- if (!base->all_timers++)
- base->timer_jiffies = jiffies;
+ if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active)
+ return;
- __internal_add_timer(base, timer);
/*
- * Update base->active_timers and base->next_timer
+ * TODO: This wants some optimizing similar to the code below, but we
+ * will do that when we switch from push to pull for deferrable timers.
*/
- if (!(timer->flags & TIMER_DEFERRABLE)) {
- if (!base->active_timers++ ||
- time_before(timer->expires, base->next_timer))
- base->next_timer = timer->expires;
+ if (timer->flags & TIMER_DEFERRABLE) {
+ if (tick_nohz_full_cpu(base->cpu))
+ wake_up_nohz_cpu(base->cpu);
+ return;
}
/*
- * Check whether the other CPU is in dynticks mode and needs
- * to be triggered to reevaluate the timer wheel.
- * We are protected against the other CPU fiddling
- * with the timer by holding the timer base lock. This also
- * makes sure that a CPU on the way to stop its tick can not
- * evaluate the timer wheel.
- *
- * Spare the IPI for deferrable timers on idle targets though.
- * The next busy ticks will take care of it. Except full dynticks
- * require special care against races with idle_cpu(), lets deal
- * with that later.
+ * We might have to IPI the remote CPU if the base is idle and the
+ * timer is not deferrable. If the other CPU is on the way to idle
+ * then it can't set base->is_idle as we hold the base lock:
*/
- if (base->nohz_active) {
- if (!(timer->flags & TIMER_DEFERRABLE) ||
- tick_nohz_full_cpu(base->cpu))
- wake_up_nohz_cpu(base->cpu);
- }
+ if (!base->is_idle)
+ return;
+
+ /* Check whether this is the new first expiring timer: */
+ if (time_after_eq(timer->expires, base->next_expiry))
+ return;
+
+ /*
+ * Set the next expiry time and kick the CPU so it can reevaluate the
+ * wheel:
+ */
+ base->next_expiry = timer->expires;
+ wake_up_nohz_cpu(base->cpu);
+}
+
+static void
+internal_add_timer(struct timer_base *base, struct timer_list *timer)
+{
+ __internal_add_timer(base, timer);
+ trigger_dyntick_cpu(base, timer);
}
#ifdef CONFIG_TIMER_STATS
@@ -666,7 +789,6 @@ static void do_init_timer(struct timer_list *timer, unsigned int flags,
{
timer->entry.pprev = NULL;
timer->flags = flags | raw_smp_processor_id();
- timer->slack = -1;
#ifdef CONFIG_TIMER_STATS
timer->start_site = NULL;
timer->start_pid = -1;
@@ -706,54 +828,125 @@ static inline void detach_timer(struct timer_list *timer, bool clear_pending)
entry->next = LIST_POISON2;
}
-static inline void
-detach_expired_timer(struct timer_list *timer, struct tvec_base *base)
-{
- detach_timer(timer, true);
- if (!(timer->flags & TIMER_DEFERRABLE))
- base->active_timers--;
- base->all_timers--;
-}
-
-static int detach_if_pending(struct timer_list *timer, struct tvec_base *base,
+static int detach_if_pending(struct timer_list *timer, struct timer_base *base,
bool clear_pending)
{
+ unsigned idx = timer_get_idx(timer);
+
if (!timer_pending(timer))
return 0;
+ if (hlist_is_singular_node(&timer->entry, base->vectors + idx))
+ __clear_bit(idx, base->pending_map);
+
detach_timer(timer, clear_pending);
- if (!(timer->flags & TIMER_DEFERRABLE)) {
- base->active_timers--;
- if (timer->expires == base->next_timer)
- base->next_timer = base->timer_jiffies;
- }
- /* If this was the last timer, advance base->jiffies */
- if (!--base->all_timers)
- base->timer_jiffies = jiffies;
return 1;
}
+/*
+ * Return the timer base of @cpu which is responsible for a timer with the
+ * flags @tflags: the standard base (BASE_STD) by default, the deferrable
+ * base (BASE_DEF) for deferrable timers while nohz is active.
+ */
+static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu)
+{
+ struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu);
+
+ /*
+ * If the timer is deferrable and nohz is active then we need to use
+ * the deferrable base.
+ */
+ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active &&
+ (tflags & TIMER_DEFERRABLE))
+ base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu);
+ return base;
+}
+
+/*
+ * Same selection between the standard and the deferrable base as done for
+ * an explicit CPU number, but for the CPU which executes this code
+ * (this_cpu_ptr()). @tflags are the flags of the timer.
+ */
+static inline struct timer_base *get_timer_this_cpu_base(u32 tflags)
+{
+ struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
+
+ /*
+ * If the timer is deferrable and nohz is active then we need to use
+ * the deferrable base.
+ */
+ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active &&
+ (tflags & TIMER_DEFERRABLE))
+ base = this_cpu_ptr(&timer_bases[BASE_DEF]);
+ return base;
+}
+
+/*
+ * Return the base for a timer with the flags @tflags. The CPU number is
+ * taken from the TIMER_CPUMASK bits of the flags.
+ */
+static inline struct timer_base *get_timer_base(u32 tflags)
+{
+ return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK);
+}
+
+#ifdef CONFIG_NO_HZ_COMMON
+/*
+ * Pick the base a timer should be queued on.
+ *
+ * @base: base the timer currently belongs to; only its
+ * migration_enabled setting is consulted
+ * @tflags: flags of the timer
+ *
+ * On SMP, pinned timers and timers on a base with migration disabled get
+ * the base of the CPU executing this code; all others get the base of the
+ * CPU returned by get_nohz_timer_target(). On UP the base of the executing
+ * CPU is always returned.
+ */
+static inline struct timer_base *
+__get_target_base(struct timer_base *base, unsigned tflags)
+{
+#ifdef CONFIG_SMP
+ if ((tflags & TIMER_PINNED) || !base->migration_enabled)
+ return get_timer_this_cpu_base(tflags);
+ return get_timer_cpu_base(tflags, get_nohz_timer_target());
+#else
+ return get_timer_this_cpu_base(tflags);
+#endif
+}
+
+/*
+ * Move the clock of an idle base forward. Nothing is done unless
+ * base->is_idle is set and jiffies is at least two ticks ahead of
+ * base->clk. Called from get_target_base() on the selected base.
+ */
+static inline void forward_timer_base(struct timer_base *base)
+{
+ /*
+ * We only forward the base when it's idle and we have a delta between
+ * base clock and jiffies.
+ */
+ if (!base->is_idle || (long) (jiffies - base->clk) < 2)
+ return;
+
+ /*
+ * If the next expiry value is > jiffies, then we fast forward to
+ * jiffies otherwise we forward to the next expiry value.
+ */
+ if (time_after(base->next_expiry, jiffies))
+ base->clk = jiffies;
+ else
+ base->clk = base->next_expiry;
+}
+#else
+/*
+ * Variants for kernels without CONFIG_NO_HZ_COMMON: the base of the CPU
+ * executing this code is always the target and the base clock is never
+ * forwarded.
+ */
+static inline struct timer_base *
+__get_target_base(struct timer_base *base, unsigned tflags)
+{
+ return get_timer_this_cpu_base(tflags);
+}
+
+static inline void forward_timer_base(struct timer_base *base) { }
+#endif
+
+/*
+ * Resolve the base a timer is going to be queued on via
+ * __get_target_base() and forward the clock of that base with
+ * forward_timer_base() before handing it back.
+ *
+ * @base: base the timer currently belongs to
+ * @tflags: flags of the timer
+ */
+static inline struct timer_base *
+get_target_base(struct timer_base *base, unsigned tflags)
+{
+ struct timer_base *target = __get_target_base(base, tflags);
+
+ forward_timer_base(target);
+ return target;
+}
+
/*
- * We are using hashed locking: holding per_cpu(tvec_bases).lock
- * means that all timers which are tied to this base via timer->base are
- * locked, and the base itself is locked too.
+ * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means
+ * that all timers which are tied to this base are locked, and the base itself
+ * is locked too.
*
* So __run_timers/migrate_timers can safely modify all timers which could
- * be found on ->tvX lists.
+ * be found in the base->vectors array.
*
- * When the timer's base is locked and removed from the list, the
- * TIMER_MIGRATING flag is set, FIXME
+ * When a timer is migrating then the TIMER_MIGRATING flag is set and we need
+ * to wait until the migration is done.
*/
-static struct tvec_base *lock_timer_base(struct timer_list *timer,
- unsigned long *flags)
+static struct timer_base *lock_timer_base(struct timer_list *timer,
+ unsigned long *flags)
__acquires(timer->base->lock)
{
for (;;) {
+ struct timer_base *base;
u32 tf = timer->flags;
- struct tvec_base *base;
if (!(tf & TIMER_MIGRATING)) {
- base = per_cpu_ptr(&tvec_bases, tf & TIMER_CPUMASK);
+ base = get_timer_base(tf);
spin_lock_irqsave(&base->lock, *flags);
if (timer->flags == tf)
return base;
@@ -764,13 +957,41 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer,
}
static inline int
-__mod_timer(struct timer_list *timer, unsigned long expires,
- bool pending_only, int pinned)
+__mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
{
- struct tvec_base *base, *new_base;
- unsigned long flags;
+ struct timer_base *base, *new_base;
+ unsigned int idx = UINT_MAX;
+ unsigned long clk = 0, flags;
int ret = 0;
+ /*
+ * This is a common optimization triggered by the networking code - if
+ * the timer is re-modified to have the same timeout or ends up in the
+ * same array bucket then just return:
+ */
+ if (timer_pending(timer)) {
+ if (timer->expires == expires)
+ return 1;
+ /*
+ * Take the current clock (base->clk) of the base, but without
+ * holding the lock!
+ */
+ base = get_timer_base(timer->flags);
+ clk = base->clk;
+
+ idx = calc_wheel_index(expires, clk);
+
+ /*
+ * Retrieve and compare the array index of the pending
+ * timer. If it matches set the expiry to the new value so a
+ * subsequent call will exit in the expires check above.
+ */
+ if (idx == timer_get_idx(timer)) {
+ timer->expires = expires;
+ return 1;
+ }
+ }
+
timer_stats_timer_set_start_info(timer);
BUG_ON(!timer->function);
@@ -782,15 +1003,15 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
debug_activate(timer, expires);
- new_base = get_target_base(base, pinned);
+ new_base = get_target_base(base, timer->flags);
if (base != new_base) {
/*
- * We are trying to schedule the timer on the local CPU.
+ * We are trying to schedule the timer on the new base.
* However we can't change timer's base while it is running,
* otherwise del_timer_sync() can't detect that the timer's
- * handler yet has not finished. This also guarantees that
- * the timer is serialized wrt itself.
+ * handler yet has not finished. This also guarantees that the
+ * timer is serialized wrt itself.
*/
if (likely(base->running_timer != timer)) {
/* See the comment in lock_timer_base() */
@@ -805,7 +1026,18 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
}
timer->expires = expires;
- internal_add_timer(base, timer);
+ /*
+ * If 'idx' was calculated above and the base time did not advance
+ * between calculating 'idx' and taking the lock, only enqueue_timer()
+ * and trigger_dyntick_cpu() are required. Otherwise we need to
+ * (re)calculate the wheel index via internal_add_timer().
+ */
+ if (idx != UINT_MAX && clk == base->clk) {
+ enqueue_timer(base, timer, idx);
+ trigger_dyntick_cpu(base, timer);
+ } else {
+ internal_add_timer(base, timer);
+ }
out_unlock:
spin_unlock_irqrestore(&base->lock, flags);
@@ -825,49 +1057,10 @@ out_unlock:
*/
int mod_timer_pending(struct timer_list *timer, unsigned long expires)
{
- return __mod_timer(timer, expires, true, TIMER_NOT_PINNED);
+ return __mod_timer(timer, expires, true);
}
EXPORT_SYMBOL(mod_timer_pending);
-/*
- * Decide where to put the timer while taking the slack into account
- *
- * Algorithm:
- * 1) calculate the maximum (absolute) time
- * 2) calculate the highest bit where the expires and new max are different
- * 3) use this bit to make a mask
- * 4) use the bitmask to round down the maximum time, so that all last
- * bits are zeros
- */
-static inline
-unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
-{
- unsigned long expires_limit, mask;
- int bit;
-
- if (timer->slack >= 0) {
- expires_limit = expires + timer->slack;
- } else {
- long delta = expires - jiffies;
-
- if (delta < 256)
- return expires;
-
- expires_limit = expires + delta / 256;
- }
- mask = expires ^ expires_limit;
- if (mask == 0)
- return expires;
-
- bit = __fls(mask);
-
- mask = (1UL << bit) - 1;
-
- expires_limit = expires_limit & ~(mask);
-
- return expires_limit;
-}
-
/**
* mod_timer - modify a timer's timeout
* @timer: the timer to be modified
@@ -890,49 +1083,11 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
*/
int mod_timer(struct timer_list *timer, unsigned long expires)
{
- expires = apply_slack(timer, expires);
-
- /*
- * This is a common optimization triggered by the
- * networking code - if the timer is re-modified
- * to be the same thing then just return:
- */
- if (timer_pending(timer) && timer->expires == expires)
- return 1;
-
- return __mod_timer(timer, expires, false, TIMER_NOT_PINNED);
+ return __mod_timer(timer, expires, false);
}
EXPORT_SYMBOL(mod_timer);
/**
- * mod_timer_pinned - modify a timer's timeout
- * @timer: the timer to be modified
- * @expires: new timeout in jiffies
- *
- * mod_timer_pinned() is a way to update the expire field of an
- * active timer (if the timer is inactive it will be activated)
- * and to ensure that the timer is scheduled on the current CPU.
- *
- * Note that this does not prevent the timer from being migrated
- * when the current CPU goes offline. If this is a problem for
- * you, use CPU-hotplug notifiers to handle it correctly, for
- * example, cancelling the timer when the corresponding CPU goes
- * offline.
- *
- * mod_timer_pinned(timer, expires) is equivalent to:
- *
- * del_timer(timer); timer->expires = expires; add_timer(timer);
- */
-int mod_timer_pinned(struct timer_list *timer, unsigned long expires)
-{
- if (timer->expires == expires && timer_pending(timer))
- return 1;
-
- return __mod_timer(timer, expires, false, TIMER_PINNED);
-}
-EXPORT_SYMBOL(mod_timer_pinned);
-
-/**
* add_timer - start a timer
* @timer: the timer to be added
*
@@ -962,13 +1117,14 @@ EXPORT_SYMBOL(add_timer);
*/
void add_timer_on(struct timer_list *timer, int cpu)
{
- struct tvec_base *new_base = per_cpu_ptr(&tvec_bases, cpu);
- struct tvec_base *base;
+ struct timer_base *new_base, *base;
unsigned long flags;
timer_stats_timer_set_start_info(timer);
BUG_ON(timer_pending(timer) || !timer->function);
+ new_base = get_timer_cpu_base(timer->flags, cpu);
+
/*
* If @timer was on a different CPU, it should be migrated with the
* old base locked to prevent other operations proceeding with the
@@ -1004,7 +1160,7 @@ EXPORT_SYMBOL_GPL(add_timer_on);
*/
int del_timer(struct timer_list *timer)
{
- struct tvec_base *base;
+ struct timer_base *base;
unsigned long flags;
int ret = 0;
@@ -1030,7 +1186,7 @@ EXPORT_SYMBOL(del_timer);
*/
int try_to_del_timer_sync(struct timer_list *timer)
{
- struct tvec_base *base;
+ struct timer_base *base;
unsigned long flags;
int ret = -1;
@@ -1114,27 +1270,6 @@ int del_timer_sync(struct timer_list *timer)
EXPORT_SYMBOL(del_timer_sync);
#endif
-static int cascade(struct tvec_base *base, struct tvec *tv, int index)
-{
- /* cascade all the timers from tv up one level */
- struct timer_list *timer;
- struct hlist_node *tmp;
- struct hlist_head tv_list;
-
- hlist_move_list(tv->vec + index, &tv_list);
-
- /*
- * We are removing _all_ timers from the list, so we
- * don't have to detach them individually.
- */
- hlist_for_each_entry_safe(timer, tmp, &tv_list, entry) {
- /* No accounting, while moving them */
- __internal_add_timer(base, timer);
- }
-
- return index;
-}
-
static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
unsigned long data)
{
@@ -1178,147 +1313,141 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
}
}
-#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)
-
-/**
- * __run_timers - run all expired timers (if any) on this CPU.
- * @base: the timer vector to be processed.
- *
- * This function cascades all vectors and executes all expired timer
- * vectors.
- */
-static inline void __run_timers(struct tvec_base *base)
+static void expire_timers(struct timer_base *base, struct hlist_head *head)
{
- struct timer_list *timer;
+ while (!hlist_empty(head)) {
+ struct timer_list *timer;
+ void (*fn)(unsigned long);
+ unsigned long data;
- spin_lock_irq(&base->lock);
+ timer = hlist_entry(head->first, struct timer_list, entry);
+ timer_stats_account_timer(timer);
- while (time_after_eq(jiffies, base->timer_jiffies)) {
- struct hlist_head work_list;
- struct hlist_head *head = &work_list;
- int index;
+ base->running_timer = timer;
+ detach_timer(timer, true);
- if (!base->all_timers) {
- base->timer_jiffies = jiffies;
- break;
+ fn = timer->function;
+ data = timer->data;
+
+ if (timer->flags & TIMER_IRQSAFE) {
+ spin_unlock(&base->lock);
+ call_timer_fn(timer, fn, data);
+ spin_lock(&base->lock);
+ } else {
+ spin_unlock_irq(&base->lock);
+ call_timer_fn(timer, fn, data);
+ spin_lock_irq(&base->lock);
}
+ }
+}
- index = base->timer_jiffies & TVR_MASK;
+static int __collect_expired_timers(struct timer_base *base,
+ struct hlist_head *heads)
+{
+ unsigned long clk = base->clk;
+ struct hlist_head *vec;
+ int i, levels = 0;
+ unsigned int idx;
- /*
- * Cascade timers:
- */
- if (!index &&
- (!cascade(base, &base->tv2, INDEX(0))) &&
- (!cascade(base, &base->tv3, INDEX(1))) &&
- !cascade(base, &base->tv4, INDEX(2)))
- cascade(base, &base->tv5, INDEX(3));
- ++base->timer_jiffies;
- hlist_move_list(base->tv1.vec + index, head);
- while (!hlist_empty(head)) {
- void (*fn)(unsigned long);
- unsigned long data;
- bool irqsafe;
-
- timer = hlist_entry(head->first, struct timer_list, entry);
- fn = timer->function;
- data = timer->data;
- irqsafe = timer->flags & TIMER_IRQSAFE;
-
- timer_stats_account_timer(timer);
-
- base->running_timer = timer;
- detach_expired_timer(timer, base);
-
- if (irqsafe) {
- spin_unlock(&base->lock);
- call_timer_fn(timer, fn, data);
- spin_lock(&base->lock);
- } else {
- spin_unlock_irq(&base->lock);
- call_timer_fn(timer, fn, data);
- spin_lock_irq(&base->lock);
- }
+ for (i = 0; i < LVL_DEPTH; i++) {
+ idx = (clk & LVL_MASK) + i * LVL_SIZE;
+
+ if (__test_and_clear_bit(idx, base->pending_map)) {
+ vec = base->vectors + idx;
+ hlist_move_list(vec, heads++);
+ levels++;
}
+ /* Is it time to look at the next level? */
+ if (clk & LVL_CLK_MASK)
+ break;
+ /* Shift clock for the next level granularity */
+ clk >>= LVL_CLK_SHIFT;
}
- base->running_timer = NULL;
- spin_unlock_irq(&base->lock);
+ return levels;
}
#ifdef CONFIG_NO_HZ_COMMON
/*
- * Find out when the next timer event is due to happen. This
- * is used on S/390 to stop all activity when a CPU is idle.
- * This function needs to be called with interrupts disabled.
+ * Find the next pending bucket of a level. Search from level start (@offset)
+ * + @clk upwards and if nothing there, search from start of the level
+ * (@offset) up to @offset + clk.
+ */
+static int next_pending_bucket(struct timer_base *base, unsigned offset,
+ unsigned clk)
+{
+ unsigned pos, start = offset + clk;
+ unsigned end = offset + LVL_SIZE;
+
+ pos = find_next_bit(base->pending_map, end, start);
+ if (pos < end)
+ return pos - start;
+
+ pos = find_next_bit(base->pending_map, start, offset);
+ return pos < start ? pos + LVL_SIZE - start : -1;
+}
+
+/*
+ * Search the first expiring timer in the various clock levels. Caller must
+ * hold base->lock.
*/
-static unsigned long __next_timer_interrupt(struct tvec_base *base)
-{
- unsigned long timer_jiffies = base->timer_jiffies;
- unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA;
- int index, slot, array, found = 0;
- struct timer_list *nte;
- struct tvec *varray[4];
-
- /* Look for timer events in tv1. */
- index = slot = timer_jiffies & TVR_MASK;
- do {
- hlist_for_each_entry(nte, base->tv1.vec + slot, entry) {
- if (nte->flags & TIMER_DEFERRABLE)
- continue;
-
- found = 1;
- expires = nte->expires;
- /* Look at the cascade bucket(s)? */
- if (!index || slot < index)
- goto cascade;
- return expires;
+static unsigned long __next_timer_interrupt(struct timer_base *base)
+{
+ unsigned long clk, next, adj;
+ unsigned lvl, offset = 0;
+
+ next = base->clk + NEXT_TIMER_MAX_DELTA;
+ clk = base->clk;
+ for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) {
+ int pos = next_pending_bucket(base, offset, clk & LVL_MASK);
+
+ if (pos >= 0) {
+ unsigned long tmp = clk + (unsigned long) pos;
+
+ tmp <<= LVL_SHIFT(lvl);
+ if (time_before(tmp, next))
+ next = tmp;
}
- slot = (slot + 1) & TVR_MASK;
- } while (slot != index);
-
-cascade:
- /* Calculate the next cascade event */
- if (index)
- timer_jiffies += TVR_SIZE - index;
- timer_jiffies >>= TVR_BITS;
-
- /* Check tv2-tv5. */
- varray[0] = &base->tv2;
- varray[1] = &base->tv3;
- varray[2] = &base->tv4;
- varray[3] = &base->tv5;
-
- for (array = 0; array < 4; array++) {
- struct tvec *varp = varray[array];
-
- index = slot = timer_jiffies & TVN_MASK;
- do {
- hlist_for_each_entry(nte, varp->vec + slot, entry) {
- if (nte->flags & TIMER_DEFERRABLE)
- continue;
-
- found = 1;
- if (time_before(nte->expires, expires))
- expires = nte->expires;
- }
- /*
- * Do we still search for the first timer or are
- * we looking up the cascade buckets ?
- */
- if (found) {
- /* Look at the cascade bucket(s)? */
- if (!index || slot < index)
- break;
- return expires;
- }
- slot = (slot + 1) & TVN_MASK;
- } while (slot != index);
-
- if (index)
- timer_jiffies += TVN_SIZE - index;
- timer_jiffies >>= TVN_BITS;
+ /*
+ * Clock for the next level. If the current level clock lower
+ * bits are zero, we look at the next level as is. If not we
+ * need to advance it by one because that's going to be the
+ * next expiring bucket in that level. base->clk is the next
+ * expiring jiffie. So in case of:
+ *
+ * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
+ * 0 0 0 0 0 0
+ *
+ * we have to look at all levels @index 0. With
+ *
+ * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
+ * 0 0 0 0 0 2
+ *
+ * LVL0 has the next expiring bucket @index 2. The upper
+ * levels have the next expiring bucket @index 1.
+ *
+ * In case that the propagation wraps the next level the same
+ * rules apply:
+ *
+ * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
+ * 0 0 0 0 F 2
+ *
+ * So after looking at LVL0 we get:
+ *
+ * LVL5 LVL4 LVL3 LVL2 LVL1
+ * 0 0 0 1 0
+ *
+ * So no propagation from LVL1 to LVL2 because that happened
+ * with the add already, but then we need to propagate further
+ * from LVL2 to LVL3.
+ *
+ * So the simple check whether the lower bits of the current
+ * level are 0 or not is sufficient for all cases.
+ */
+ adj = clk & LVL_CLK_MASK ? 1 : 0;
+ clk >>= LVL_CLK_SHIFT;
+ clk += adj;
}
- return expires;
+ return next;
}
/*
@@ -1364,9 +1493,10 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
*/
u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
{
- struct tvec_base *base = this_cpu_ptr(&tvec_bases);
+ struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
u64 expires = KTIME_MAX;
unsigned long nextevt;
+ bool is_max_delta;
/*
* Pretend that there is no timer pending if the cpu is offline.
@@ -1376,19 +1506,82 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
return expires;
spin_lock(&base->lock);
- if (base->active_timers) {
- if (time_before_eq(base->next_timer, base->timer_jiffies))
- base->next_timer = __next_timer_interrupt(base);
- nextevt = base->next_timer;
- if (time_before_eq(nextevt, basej))
- expires = basem;
- else
+ nextevt = __next_timer_interrupt(base);
+ is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA);
+ base->next_expiry = nextevt;
+ /*
+ * We have a fresh next event. Check whether we can forward the base:
+ */
+ if (time_after(nextevt, jiffies))
+ base->clk = jiffies;
+ else if (time_after(nextevt, base->clk))
+ base->clk = nextevt;
+
+ if (time_before_eq(nextevt, basej)) {
+ expires = basem;
+ base->is_idle = false;
+ } else {
+ if (!is_max_delta)
expires = basem + (nextevt - basej) * TICK_NSEC;
+ /*
+ * If we expect to sleep more than a tick, mark the base idle:
+ */
+ if ((expires - basem) > TICK_NSEC)
+ base->is_idle = true;
}
spin_unlock(&base->lock);
return cmp_next_hrtimer_event(basem, expires);
}
+
+/**
+ * timer_clear_idle - Clear the idle state of the timer base
+ *
+ * Called with interrupts disabled
+ */
+void timer_clear_idle(void)
+{
+ struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
+
+ /*
+ * We do this unlocked. The worst outcome is a remote enqueue sending
+ * a pointless IPI, but taking the lock would just make the window for
+ * sending the IPI a few instructions smaller for the cost of taking
+ * the lock in the exit from idle path.
+ */
+ base->is_idle = false;
+}
+
+static int collect_expired_timers(struct timer_base *base,
+ struct hlist_head *heads)
+{
+ /*
+ * NOHZ optimization. After a long idle sleep we need to forward the
+ * base to current jiffies. Avoid a loop by searching the bitfield for
+ * the next expiring timer.
+ */
+ if ((long)(jiffies - base->clk) > 2) {
+ unsigned long next = __next_timer_interrupt(base);
+
+ /*
+ * If the next timer is ahead of time forward to current
+ * jiffies, otherwise forward to the next expiry time:
+ */
+ if (time_after(next, jiffies)) {
+ /* The call site will increment clock! */
+ base->clk = jiffies - 1;
+ return 0;
+ }
+ base->clk = next;
+ }
+ return __collect_expired_timers(base, heads);
+}
+#else
+static inline int collect_expired_timers(struct timer_base *base,
+ struct hlist_head *heads)
+{
+ return __collect_expired_timers(base, heads);
+}
#endif
/*
@@ -1411,15 +1604,42 @@ void update_process_times(int user_tick)
run_posix_cpu_timers(p);
}
+/**
+ * __run_timers - run all expired timers (if any) on this CPU.
+ * @base: the timer vector to be processed.
+ */
+static inline void __run_timers(struct timer_base *base)
+{
+ struct hlist_head heads[LVL_DEPTH];
+ int levels;
+
+ if (!time_after_eq(jiffies, base->clk))
+ return;
+
+ spin_lock_irq(&base->lock);
+
+ while (time_after_eq(jiffies, base->clk)) {
+
+ levels = collect_expired_timers(base, heads);
+ base->clk++;
+
+ while (levels--)
+ expire_timers(base, heads + levels);
+ }
+ base->running_timer = NULL;
+ spin_unlock_irq(&base->lock);
+}
+
/*
* This function runs timers and the timer-tq in bottom half context.
*/
static void run_timer_softirq(struct softirq_action *h)
{
- struct tvec_base *base = this_cpu_ptr(&tvec_bases);
+ struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
- if (time_after_eq(jiffies, base->timer_jiffies))
- __run_timers(base);
+ __run_timers(base);
+ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active)
+ __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
}
/*
@@ -1427,7 +1647,18 @@ static void run_timer_softirq(struct softirq_action *h)
*/
void run_local_timers(void)
{
+ struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
+
hrtimer_run_queues();
+ /* Raise the softirq only if required. */
+ if (time_before(jiffies, base->clk)) {
+ if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active)
+ return;
+ /* CPU is awake, so check the deferrable base. */
+ base++;
+ if (time_before(jiffies, base->clk))
+ return;
+ }
raise_softirq(TIMER_SOFTIRQ);
}
@@ -1512,7 +1743,7 @@ signed long __sched schedule_timeout(signed long timeout)
expire = timeout + jiffies;
setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
- __mod_timer(&timer, expire, false, TIMER_NOT_PINNED);
+ __mod_timer(&timer, expire, false);
schedule();
del_singleshot_timer_sync(&timer);
@@ -1563,87 +1794,62 @@ signed long __sched schedule_timeout_idle(signed long timeout)
EXPORT_SYMBOL(schedule_timeout_idle);
#ifdef CONFIG_HOTPLUG_CPU
-static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head)
+static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *head)
{
struct timer_list *timer;
int cpu = new_base->cpu;
while (!hlist_empty(head)) {
timer = hlist_entry(head->first, struct timer_list, entry);
- /* We ignore the accounting on the dying cpu */
detach_timer(timer, false);
timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu;
internal_add_timer(new_base, timer);
}
}
-static void migrate_timers(int cpu)
+int timers_dead_cpu(unsigned int cpu)
{
- struct tvec_base *old_base;
- struct tvec_base *new_base;
- int i;
+ struct timer_base *old_base;
+ struct timer_base *new_base;
+ int b, i;
BUG_ON(cpu_online(cpu));
- old_base = per_cpu_ptr(&tvec_bases, cpu);
- new_base = get_cpu_ptr(&tvec_bases);
- /*
- * The caller is globally serialized and nobody else
- * takes two locks at once, deadlock is not possible.
- */
- spin_lock_irq(&new_base->lock);
- spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
-
- BUG_ON(old_base->running_timer);
-
- for (i = 0; i < TVR_SIZE; i++)
- migrate_timer_list(new_base, old_base->tv1.vec + i);
- for (i = 0; i < TVN_SIZE; i++) {
- migrate_timer_list(new_base, old_base->tv2.vec + i);
- migrate_timer_list(new_base, old_base->tv3.vec + i);
- migrate_timer_list(new_base, old_base->tv4.vec + i);
- migrate_timer_list(new_base, old_base->tv5.vec + i);
- }
- old_base->active_timers = 0;
- old_base->all_timers = 0;
+ for (b = 0; b < NR_BASES; b++) {
+ old_base = per_cpu_ptr(&timer_bases[b], cpu);
+ new_base = get_cpu_ptr(&timer_bases[b]);
+ /*
+ * The caller is globally serialized and nobody else
+ * takes two locks at once, deadlock is not possible.
+ */
+ spin_lock_irq(&new_base->lock);
+ spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
- spin_unlock(&old_base->lock);
- spin_unlock_irq(&new_base->lock);
- put_cpu_ptr(&tvec_bases);
-}
+ BUG_ON(old_base->running_timer);
-static int timer_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
-{
- switch (action) {
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- migrate_timers((long)hcpu);
- break;
- default:
- break;
- }
+ for (i = 0; i < WHEEL_SIZE; i++)
+ migrate_timer_list(new_base, old_base->vectors + i);
- return NOTIFY_OK;
+ spin_unlock(&old_base->lock);
+ spin_unlock_irq(&new_base->lock);
+ put_cpu_ptr(&timer_bases);
+ }
+ return 0;
}
-static inline void timer_register_cpu_notifier(void)
-{
- cpu_notifier(timer_cpu_notify, 0);
-}
-#else
-static inline void timer_register_cpu_notifier(void) { }
#endif /* CONFIG_HOTPLUG_CPU */
static void __init init_timer_cpu(int cpu)
{
- struct tvec_base *base = per_cpu_ptr(&tvec_bases, cpu);
-
- base->cpu = cpu;
- spin_lock_init(&base->lock);
+ struct timer_base *base;
+ int i;
- base->timer_jiffies = jiffies;
- base->next_timer = base->timer_jiffies;
+ for (i = 0; i < NR_BASES; i++) {
+ base = per_cpu_ptr(&timer_bases[i], cpu);
+ base->cpu = cpu;
+ spin_lock_init(&base->lock);
+ base->clk = jiffies;
+ }
}
static void __init init_timer_cpus(void)
@@ -1658,7 +1864,6 @@ void __init init_timers(void)
{
init_timer_cpus();
init_timer_stats();
- timer_register_cpu_notifier();
open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
}
@@ -1702,9 +1907,15 @@ static void __sched do_usleep_range(unsigned long min, unsigned long max)
}
/**
- * usleep_range - Drop in replacement for udelay where wakeup is flexible
+ * usleep_range - Sleep for an approximate time
* @min: Minimum time in usecs to sleep
* @max: Maximum time in usecs to sleep
+ *
+ * In non-atomic context where the exact wakeup time is flexible, use
+ * usleep_range() instead of udelay(). The sleep improves responsiveness
+ * by avoiding the CPU-hogging busy-wait of udelay(), and the range reduces
+ * power usage by allowing hrtimers to take advantage of an already-
+ * scheduled interrupt instead of scheduling a new one just for this sleep.
*/
void __sched usleep_range(unsigned long min, unsigned long max)
{
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 1adecb4b8..087204c73 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -279,7 +279,7 @@ static void print_name_offset(struct seq_file *m, unsigned long addr)
static int tstats_show(struct seq_file *m, void *v)
{
- struct timespec period;
+ struct timespec64 period;
struct entry *entry;
unsigned long ms;
long events = 0;
@@ -295,11 +295,11 @@ static int tstats_show(struct seq_file *m, void *v)
time = ktime_sub(time_stop, time_start);
- period = ktime_to_timespec(time);
+ period = ktime_to_timespec64(time);
ms = period.tv_nsec / 1000000;
seq_puts(m, "Timer Stats Version: v0.3\n");
- seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms);
+ seq_printf(m, "Sample period: %ld.%03ld s\n", (long)period.tv_sec, ms);
if (atomic_read(&overflow_count))
seq_printf(m, "Overflow: %d entries\n", atomic_read(&overflow_count));
seq_printf(m, "Collection: %s\n", timer_stats_active ? "active" : "inactive");
diff --git a/kernel/torture.c b/kernel/torture.c
index fa0bdeee1..75961b3de 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -82,6 +82,104 @@ static int min_online = -1;
static int max_online;
/*
+ * Attempt to take a CPU offline. Return false if the CPU is already
+ * offline or if it is not subject to CPU-hotplug operations. The
+ * caller can detect other failures by looking at the statistics.
+ */
+bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes,
+ unsigned long *sum_offl, int *min_offl, int *max_offl)
+{
+ unsigned long delta;
+ int ret;
+ unsigned long starttime;
+
+ if (!cpu_online(cpu) || !cpu_is_hotpluggable(cpu))
+ return false;
+
+ if (verbose)
+ pr_alert("%s" TORTURE_FLAG
+ "torture_onoff task: offlining %d\n",
+ torture_type, cpu);
+ starttime = jiffies;
+ (*n_offl_attempts)++;
+ ret = cpu_down(cpu);
+ if (ret) {
+ if (verbose)
+ pr_alert("%s" TORTURE_FLAG
+ "torture_onoff task: offline %d failed: errno %d\n",
+ torture_type, cpu, ret);
+ } else {
+ if (verbose)
+ pr_alert("%s" TORTURE_FLAG
+ "torture_onoff task: offlined %d\n",
+ torture_type, cpu);
+ (*n_offl_successes)++;
+ delta = jiffies - starttime;
+ /* Add this offline latency to the caller's running total. */
+ *sum_offl += delta;
+ /* A negative minimum means no sample has been recorded yet. */
+ if (*min_offl < 0) {
+ *min_offl = delta;
+ *max_offl = delta;
+ }
+ if (*min_offl > delta)
+ *min_offl = delta;
+ if (*max_offl < delta)
+ *max_offl = delta;
+ }
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(torture_offline);
+
+/*
+ * Attempt to bring a CPU online. Return false if the CPU is already
+ * online or if it is not subject to CPU-hotplug operations. The
+ * caller can detect other failures by looking at the statistics.
+ */
+bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes,
+ unsigned long *sum_onl, int *min_onl, int *max_onl)
+{
+ unsigned long delta;
+ int ret;
+ unsigned long starttime;
+
+ if (cpu_online(cpu) || !cpu_is_hotpluggable(cpu))
+ return false;
+
+ if (verbose)
+ pr_alert("%s" TORTURE_FLAG
+ "torture_onoff task: onlining %d\n",
+ torture_type, cpu);
+ starttime = jiffies;
+ (*n_onl_attempts)++;
+ ret = cpu_up(cpu);
+ if (ret) {
+ if (verbose)
+ pr_alert("%s" TORTURE_FLAG
+ "torture_onoff task: online %d failed: errno %d\n",
+ torture_type, cpu, ret);
+ } else {
+ if (verbose)
+ pr_alert("%s" TORTURE_FLAG
+ "torture_onoff task: onlined %d\n",
+ torture_type, cpu);
+ (*n_onl_successes)++;
+ delta = jiffies - starttime;
+ *sum_onl += delta;
+ if (*min_onl < 0) {
+ *min_onl = delta;
+ *max_onl = delta;
+ }
+ if (*min_onl > delta)
+ *min_onl = delta;
+ if (*max_onl < delta)
+ *max_onl = delta;
+ }
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(torture_online);
+
+/*
* Execute random CPU-hotplug operations at the interval specified
* by the onoff_interval.
*/
@@ -89,16 +187,19 @@ static int
torture_onoff(void *arg)
{
int cpu;
- unsigned long delta;
int maxcpu = -1;
DEFINE_TORTURE_RANDOM(rand);
- int ret;
- unsigned long starttime;
VERBOSE_TOROUT_STRING("torture_onoff task started");
for_each_online_cpu(cpu)
maxcpu = cpu;
WARN_ON(maxcpu < 0);
+
+ if (maxcpu == 0) {
+ VERBOSE_TOROUT_STRING("Only one CPU, so CPU-hotplug testing is disabled");
+ goto stop;
+ }
+
if (onoff_holdoff > 0) {
VERBOSE_TOROUT_STRING("torture_onoff begin holdoff");
schedule_timeout_interruptible(onoff_holdoff);
@@ -106,69 +207,16 @@ torture_onoff(void *arg)
}
while (!torture_must_stop()) {
cpu = (torture_random(&rand) >> 4) % (maxcpu + 1);
- if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) {
- if (verbose)
- pr_alert("%s" TORTURE_FLAG
- "torture_onoff task: offlining %d\n",
- torture_type, cpu);
- starttime = jiffies;
- n_offline_attempts++;
- ret = cpu_down(cpu);
- if (ret) {
- if (verbose)
- pr_alert("%s" TORTURE_FLAG
- "torture_onoff task: offline %d failed: errno %d\n",
- torture_type, cpu, ret);
- } else {
- if (verbose)
- pr_alert("%s" TORTURE_FLAG
- "torture_onoff task: offlined %d\n",
- torture_type, cpu);
- n_offline_successes++;
- delta = jiffies - starttime;
- sum_offline += delta;
- if (min_offline < 0) {
- min_offline = delta;
- max_offline = delta;
- }
- if (min_offline > delta)
- min_offline = delta;
- if (max_offline < delta)
- max_offline = delta;
- }
- } else if (cpu_is_hotpluggable(cpu)) {
- if (verbose)
- pr_alert("%s" TORTURE_FLAG
- "torture_onoff task: onlining %d\n",
- torture_type, cpu);
- starttime = jiffies;
- n_online_attempts++;
- ret = cpu_up(cpu);
- if (ret) {
- if (verbose)
- pr_alert("%s" TORTURE_FLAG
- "torture_onoff task: online %d failed: errno %d\n",
- torture_type, cpu, ret);
- } else {
- if (verbose)
- pr_alert("%s" TORTURE_FLAG
- "torture_onoff task: onlined %d\n",
- torture_type, cpu);
- n_online_successes++;
- delta = jiffies - starttime;
- sum_online += delta;
- if (min_online < 0) {
- min_online = delta;
- max_online = delta;
- }
- if (min_online > delta)
- min_online = delta;
- if (max_online < delta)
- max_online = delta;
- }
- }
+ if (!torture_offline(cpu,
+ &n_offline_attempts, &n_offline_successes,
+ &sum_offline, &min_offline, &max_offline))
+ torture_online(cpu,
+ &n_online_attempts, &n_online_successes,
+ &sum_online, &min_online, &max_online);
schedule_timeout_interruptible(onoff_interval);
}
+
+stop:
torture_kthread_stopping("torture_onoff");
return 0;
}
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index fafeaf803..f4b86e8ca 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -542,6 +542,7 @@ config HIST_TRIGGERS
bool "Histogram triggers"
depends on ARCH_HAVE_NMI_SAFE_CMPXCHG
select TRACING_MAP
+ select TRACING
default n
help
Hist triggers allow one or more arbitrary trace event fields
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 9aef8654e..dbafc5df0 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -127,12 +127,13 @@ static void trace_note_tsk(struct task_struct *tsk)
static void trace_note_time(struct blk_trace *bt)
{
- struct timespec now;
+ struct timespec64 now;
unsigned long flags;
u32 words[2];
- getnstimeofday(&now);
- words[0] = now.tv_sec;
+ /* need to check user space to see if this breaks in y2038 or y2106 */
+ ktime_get_real_ts64(&now);
+ words[0] = (u32)now.tv_sec;
words[1] = now.tv_nsec;
local_irq_save(flags);
@@ -189,6 +190,7 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
BLK_TC_ACT(BLK_TC_WRITE) };
#define BLK_TC_RAHEAD BLK_TC_AHEAD
+#define BLK_TC_PREFLUSH BLK_TC_FLUSH
/* The ilog2() calls fall out because they're constant */
#define MASK_TC_BIT(rw, __name) ((rw & REQ_ ## __name) << \
@@ -199,7 +201,8 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
* blk_io_trace structure and places it in a per-cpu subbuffer.
*/
static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
- int rw, u32 what, int error, int pdu_len, void *pdu_data)
+ int op, int op_flags, u32 what, int error, int pdu_len,
+ void *pdu_data)
{
struct task_struct *tsk = current;
struct ring_buffer_event *event = NULL;
@@ -214,13 +217,16 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer))
return;
- what |= ddir_act[rw & WRITE];
- what |= MASK_TC_BIT(rw, SYNC);
- what |= MASK_TC_BIT(rw, RAHEAD);
- what |= MASK_TC_BIT(rw, META);
- what |= MASK_TC_BIT(rw, DISCARD);
- what |= MASK_TC_BIT(rw, FLUSH);
- what |= MASK_TC_BIT(rw, FUA);
+ what |= ddir_act[op_is_write(op) ? WRITE : READ];
+ what |= MASK_TC_BIT(op_flags, SYNC);
+ what |= MASK_TC_BIT(op_flags, RAHEAD);
+ what |= MASK_TC_BIT(op_flags, META);
+ what |= MASK_TC_BIT(op_flags, PREFLUSH);
+ what |= MASK_TC_BIT(op_flags, FUA);
+ if (op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE)
+ what |= BLK_TC_ACT(BLK_TC_DISCARD);
+ if (op == REQ_OP_FLUSH)
+ what |= BLK_TC_ACT(BLK_TC_FLUSH);
pid = tsk->pid;
if (act_log_check(bt, what, sector, pid))
@@ -708,11 +714,11 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
what |= BLK_TC_ACT(BLK_TC_PC);
- __blk_add_trace(bt, 0, nr_bytes, rq->cmd_flags,
+ __blk_add_trace(bt, 0, nr_bytes, req_op(rq), rq->cmd_flags,
what, rq->errors, rq->cmd_len, rq->cmd);
} else {
what |= BLK_TC_ACT(BLK_TC_FS);
- __blk_add_trace(bt, blk_rq_pos(rq), nr_bytes,
+ __blk_add_trace(bt, blk_rq_pos(rq), nr_bytes, req_op(rq),
rq->cmd_flags, what, rq->errors, 0, NULL);
}
}
@@ -770,7 +776,7 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
return;
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
- bio->bi_rw, what, error, 0, NULL);
+ bio_op(bio), bio->bi_opf, what, error, 0, NULL);
}
static void blk_add_trace_bio_bounce(void *ignore,
@@ -818,7 +824,8 @@ static void blk_add_trace_getrq(void *ignore,
struct blk_trace *bt = q->blk_trace;
if (bt)
- __blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, NULL);
+ __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0,
+ NULL);
}
}
@@ -833,7 +840,7 @@ static void blk_add_trace_sleeprq(void *ignore,
struct blk_trace *bt = q->blk_trace;
if (bt)
- __blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ,
+ __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ,
0, 0, NULL);
}
}
@@ -843,7 +850,7 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q)
struct blk_trace *bt = q->blk_trace;
if (bt)
- __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
+ __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
}
static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
@@ -860,7 +867,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
else
what = BLK_TA_UNPLUG_TIMER;
- __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
+ __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
}
}
@@ -874,8 +881,9 @@ static void blk_add_trace_split(void *ignore,
__be64 rpdu = cpu_to_be64(pdu);
__blk_add_trace(bt, bio->bi_iter.bi_sector,
- bio->bi_iter.bi_size, bio->bi_rw, BLK_TA_SPLIT,
- bio->bi_error, sizeof(rpdu), &rpdu);
+ bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf,
+ BLK_TA_SPLIT, bio->bi_error, sizeof(rpdu),
+ &rpdu);
}
}
@@ -907,7 +915,7 @@ static void blk_add_trace_bio_remap(void *ignore,
r.sector_from = cpu_to_be64(from);
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
- bio->bi_rw, BLK_TA_REMAP, bio->bi_error,
+ bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_error,
sizeof(r), &r);
}
@@ -940,7 +948,7 @@ static void blk_add_trace_rq_remap(void *ignore,
r.sector_from = cpu_to_be64(from);
__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
- rq_data_dir(rq), BLK_TA_REMAP, !!rq->errors,
+ rq_data_dir(rq), 0, BLK_TA_REMAP, !!rq->errors,
sizeof(r), &r);
}
@@ -965,10 +973,10 @@ void blk_add_driver_data(struct request_queue *q,
return;
if (rq->cmd_type == REQ_TYPE_BLOCK_PC)
- __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0,
+ __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, 0,
BLK_TA_DRV_DATA, rq->errors, len, data);
else
- __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0,
+ __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0, 0,
BLK_TA_DRV_DATA, rq->errors, len, data);
}
EXPORT_SYMBOL_GPL(blk_add_driver_data);
@@ -1769,21 +1777,34 @@ void blk_dump_cmd(char *buf, struct request *rq)
}
}
-void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
+void blk_fill_rwbs(char *rwbs, int op, u32 rw, int bytes)
{
int i = 0;
- if (rw & REQ_FLUSH)
+ if (rw & REQ_PREFLUSH)
rwbs[i++] = 'F';
- if (rw & WRITE)
+ switch (op) {
+ case REQ_OP_WRITE:
+ case REQ_OP_WRITE_SAME:
rwbs[i++] = 'W';
- else if (rw & REQ_DISCARD)
+ break;
+ case REQ_OP_DISCARD:
+ rwbs[i++] = 'D';
+ break;
+ case REQ_OP_SECURE_ERASE:
rwbs[i++] = 'D';
- else if (bytes)
+ rwbs[i++] = 'E';
+ break;
+ case REQ_OP_FLUSH:
+ rwbs[i++] = 'F';
+ break;
+ case REQ_OP_READ:
rwbs[i++] = 'R';
- else
+ break;
+ default:
rwbs[i++] = 'N';
+ }
if (rw & REQ_FUA)
rwbs[i++] = 'F';
@@ -1793,8 +1814,6 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
rwbs[i++] = 'S';
if (rw & REQ_META)
rwbs[i++] = 'M';
- if (rw & REQ_SECURE)
- rwbs[i++] = 'E';
rwbs[i] = '\0';
}
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 26f603da7..b20438fdb 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -81,6 +81,49 @@ static const struct bpf_func_proto bpf_probe_read_proto = {
.arg3_type = ARG_ANYTHING,
};
+static u64 bpf_probe_write_user(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ void *unsafe_ptr = (void *) (long) r1;
+ void *src = (void *) (long) r2;
+ int size = (int) r3;
+
+ /*
+ * Ensure we're in user context which is safe for the helper to
+ * run. This helper has no business in a kthread.
+ *
+ * access_ok() should prevent writing to non-user memory, but in
+ * some situations (nommu, temporary switch, etc) access_ok() does
+ * not provide enough validation, hence the check on KERNEL_DS.
+ */
+
+ if (unlikely(in_interrupt() ||
+ current->flags & (PF_KTHREAD | PF_EXITING)))
+ return -EPERM;
+ if (unlikely(segment_eq(get_fs(), KERNEL_DS)))
+ return -EPERM;
+ if (!access_ok(VERIFY_WRITE, unsafe_ptr, size))
+ return -EPERM;
+
+ return probe_kernel_write(unsafe_ptr, src, size);
+}
+
+static const struct bpf_func_proto bpf_probe_write_user_proto = {
+ .func = bpf_probe_write_user,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_ANYTHING,
+ .arg2_type = ARG_PTR_TO_STACK,
+ .arg3_type = ARG_CONST_STACK_SIZE,
+};
+
+static const struct bpf_func_proto *bpf_get_probe_write_proto(void)
+{
+ pr_warn_ratelimited("%s[%d] is installing a program with bpf_probe_write_user helper that may corrupt user memory!",
+ current->comm, task_pid_nr(current));
+
+ return &bpf_probe_write_user_proto;
+}
+
/*
* limited trace_printk()
* only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed
@@ -188,31 +231,35 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
return &bpf_trace_printk_proto;
}
-static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
+static u64 bpf_perf_event_read(u64 r1, u64 flags, u64 r3, u64 r4, u64 r5)
{
struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
struct bpf_array *array = container_of(map, struct bpf_array, map);
+ unsigned int cpu = smp_processor_id();
+ u64 index = flags & BPF_F_INDEX_MASK;
+ struct bpf_event_entry *ee;
struct perf_event *event;
- struct file *file;
+ if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
+ return -EINVAL;
+ if (index == BPF_F_CURRENT_CPU)
+ index = cpu;
if (unlikely(index >= array->map.max_entries))
return -E2BIG;
- file = READ_ONCE(array->ptrs[index]);
- if (unlikely(!file))
+ ee = READ_ONCE(array->ptrs[index]);
+ if (!ee)
return -ENOENT;
- event = file->private_data;
-
- /* make sure event is local and doesn't have pmu::count */
- if (event->oncpu != smp_processor_id() ||
- event->pmu->count)
- return -EINVAL;
-
+ event = ee->event;
if (unlikely(event->attr.type != PERF_TYPE_HARDWARE &&
event->attr.type != PERF_TYPE_RAW))
return -EINVAL;
+ /* make sure event is local and doesn't have pmu::count */
+ if (unlikely(event->oncpu != cpu || event->pmu->count))
+ return -EINVAL;
+
/*
* we don't know if the function is run successfully by the
* return value. It can be judged in other places, such as
@@ -229,47 +276,58 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = {
.arg2_type = ARG_ANYTHING,
};
-static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
+static __always_inline u64
+__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
+ u64 flags, struct perf_raw_record *raw)
{
- struct pt_regs *regs = (struct pt_regs *) (long) r1;
- struct bpf_map *map = (struct bpf_map *) (long) r2;
struct bpf_array *array = container_of(map, struct bpf_array, map);
+ unsigned int cpu = smp_processor_id();
u64 index = flags & BPF_F_INDEX_MASK;
- void *data = (void *) (long) r4;
struct perf_sample_data sample_data;
+ struct bpf_event_entry *ee;
struct perf_event *event;
- struct file *file;
- struct perf_raw_record raw = {
- .size = size,
- .data = data,
- };
- if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
- return -EINVAL;
if (index == BPF_F_CURRENT_CPU)
- index = raw_smp_processor_id();
+ index = cpu;
if (unlikely(index >= array->map.max_entries))
return -E2BIG;
- file = READ_ONCE(array->ptrs[index]);
- if (unlikely(!file))
+ ee = READ_ONCE(array->ptrs[index]);
+ if (!ee)
return -ENOENT;
- event = file->private_data;
-
+ event = ee->event;
if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE ||
event->attr.config != PERF_COUNT_SW_BPF_OUTPUT))
return -EINVAL;
- if (unlikely(event->oncpu != smp_processor_id()))
+ if (unlikely(event->oncpu != cpu))
return -EOPNOTSUPP;
perf_sample_data_init(&sample_data, 0, 0);
- sample_data.raw = &raw;
+ sample_data.raw = raw;
perf_event_output(event, &sample_data, regs);
return 0;
}
+static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
+{
+ struct pt_regs *regs = (struct pt_regs *)(long) r1;
+ struct bpf_map *map = (struct bpf_map *)(long) r2;
+ void *data = (void *)(long) r4;
+ struct perf_raw_record raw = {
+ .frag = {
+ .size = size,
+ .data = data,
+ },
+ };
+
+ if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
+ return -EINVAL;
+
+ return __bpf_perf_event_output(regs, map, flags, &raw);
+}
+
static const struct bpf_func_proto bpf_perf_event_output_proto = {
.func = bpf_perf_event_output,
.gpl_only = true,
@@ -283,31 +341,41 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = {
static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs);
-static u64 bpf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
+u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
+ void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
{
struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs);
+ struct perf_raw_frag frag = {
+ .copy = ctx_copy,
+ .size = ctx_size,
+ .data = ctx,
+ };
+ struct perf_raw_record raw = {
+ .frag = {
+ {
+ .next = ctx_size ? &frag : NULL,
+ },
+ .size = meta_size,
+ .data = meta,
+ },
+ };
perf_fetch_caller_regs(regs);
- return bpf_perf_event_output((long)regs, r2, flags, r4, size);
+ return __bpf_perf_event_output(regs, map, flags, &raw);
+}
+
+static u64 bpf_get_current_task(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ return (long) current;
}
-static const struct bpf_func_proto bpf_event_output_proto = {
- .func = bpf_event_output,
+static const struct bpf_func_proto bpf_get_current_task_proto = {
+ .func = bpf_get_current_task,
.gpl_only = true,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_CONST_MAP_PTR,
- .arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_STACK,
- .arg5_type = ARG_CONST_STACK_SIZE,
};
-const struct bpf_func_proto *bpf_get_event_output_proto(void)
-{
- return &bpf_event_output_proto;
-}
-
static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
{
switch (func_id) {
@@ -325,6 +393,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
return &bpf_tail_call_proto;
case BPF_FUNC_get_current_pid_tgid:
return &bpf_get_current_pid_tgid_proto;
+ case BPF_FUNC_get_current_task:
+ return &bpf_get_current_task_proto;
case BPF_FUNC_get_current_uid_gid:
return &bpf_get_current_uid_gid_proto;
case BPF_FUNC_get_current_comm:
@@ -335,6 +405,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
return &bpf_get_smp_processor_id_proto;
case BPF_FUNC_perf_event_read:
return &bpf_perf_event_read_proto;
+ case BPF_FUNC_probe_write_user:
+ return bpf_get_probe_write_proto();
default:
return NULL;
}
@@ -356,18 +428,12 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
enum bpf_reg_type *reg_type)
{
- /* check bounds */
if (off < 0 || off >= sizeof(struct pt_regs))
return false;
-
- /* only read is allowed */
if (type != BPF_READ)
return false;
-
- /* disallow misaligned access */
if (off % size != 0)
return false;
-
return true;
}
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 900dbb1ef..84752c8e2 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -89,16 +89,16 @@ struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end;
/* What to set function_trace_op to */
static struct ftrace_ops *set_function_trace_op;
-/* List for set_ftrace_pid's pids. */
-LIST_HEAD(ftrace_pids);
-struct ftrace_pid {
- struct list_head list;
- struct pid *pid;
-};
-
-static bool ftrace_pids_enabled(void)
+static bool ftrace_pids_enabled(struct ftrace_ops *ops)
{
- return !list_empty(&ftrace_pids);
+ struct trace_array *tr;
+
+ if (!(ops->flags & FTRACE_OPS_FL_PID) || !ops->private)
+ return false;
+
+ tr = ops->private;
+
+ return tr->function_pids != NULL;
}
static void ftrace_update_trampoline(struct ftrace_ops *ops);
@@ -179,7 +179,9 @@ int ftrace_nr_registered_ops(void)
static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *regs)
{
- if (!test_tsk_trace_trace(current))
+ struct trace_array *tr = op->private;
+
+ if (tr && this_cpu_read(tr->trace_buffer.data->ftrace_ignore_pid))
return;
op->saved_func(ip, parent_ip, op, regs);
@@ -417,7 +419,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
/* Always save the function, and reset at unregistering */
ops->saved_func = ops->func;
- if (ops->flags & FTRACE_OPS_FL_PID && ftrace_pids_enabled())
+ if (ftrace_pids_enabled(ops))
ops->func = ftrace_pid_func;
ftrace_update_trampoline(ops);
@@ -450,7 +452,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
static void ftrace_update_pid_func(void)
{
- bool enabled = ftrace_pids_enabled();
struct ftrace_ops *op;
/* Only do something if we are tracing something */
@@ -459,8 +460,8 @@ static void ftrace_update_pid_func(void)
do_for_each_ftrace_op(op, ftrace_ops_list) {
if (op->flags & FTRACE_OPS_FL_PID) {
- op->func = enabled ? ftrace_pid_func :
- op->saved_func;
+ op->func = ftrace_pids_enabled(op) ?
+ ftrace_pid_func : op->saved_func;
ftrace_update_trampoline(op);
}
} while_for_each_ftrace_op(op);
@@ -5324,179 +5325,99 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
return ops->func;
}
-static void clear_ftrace_swapper(void)
+static void
+ftrace_filter_pid_sched_switch_probe(void *data, bool preempt,
+ struct task_struct *prev, struct task_struct *next)
{
- struct task_struct *p;
- int cpu;
+ struct trace_array *tr = data;
+ struct trace_pid_list *pid_list;
- get_online_cpus();
- for_each_online_cpu(cpu) {
- p = idle_task(cpu);
- clear_tsk_trace_trace(p);
- }
- put_online_cpus();
-}
-
-static void set_ftrace_swapper(void)
-{
- struct task_struct *p;
- int cpu;
+ pid_list = rcu_dereference_sched(tr->function_pids);
- get_online_cpus();
- for_each_online_cpu(cpu) {
- p = idle_task(cpu);
- set_tsk_trace_trace(p);
- }
- put_online_cpus();
+ this_cpu_write(tr->trace_buffer.data->ftrace_ignore_pid,
+ trace_ignore_this_task(pid_list, next));
}
-static void clear_ftrace_pid(struct pid *pid)
+static void clear_ftrace_pids(struct trace_array *tr)
{
- struct task_struct *p;
+ struct trace_pid_list *pid_list;
+ int cpu;
- rcu_read_lock();
- do_each_pid_task(pid, PIDTYPE_PID, p) {
- clear_tsk_trace_trace(p);
- } while_each_pid_task(pid, PIDTYPE_PID, p);
- rcu_read_unlock();
+ pid_list = rcu_dereference_protected(tr->function_pids,
+ lockdep_is_held(&ftrace_lock));
+ if (!pid_list)
+ return;
- put_pid(pid);
-}
+ unregister_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr);
-static void set_ftrace_pid(struct pid *pid)
-{
- struct task_struct *p;
+ for_each_possible_cpu(cpu)
+ per_cpu_ptr(tr->trace_buffer.data, cpu)->ftrace_ignore_pid = false;
- rcu_read_lock();
- do_each_pid_task(pid, PIDTYPE_PID, p) {
- set_tsk_trace_trace(p);
- } while_each_pid_task(pid, PIDTYPE_PID, p);
- rcu_read_unlock();
-}
+ rcu_assign_pointer(tr->function_pids, NULL);
-static void clear_ftrace_pid_task(struct pid *pid)
-{
- if (pid == ftrace_swapper_pid)
- clear_ftrace_swapper();
- else
- clear_ftrace_pid(pid);
-}
+ /* Wait till all users are no longer using pid filtering */
+ synchronize_sched();
-static void set_ftrace_pid_task(struct pid *pid)
-{
- if (pid == ftrace_swapper_pid)
- set_ftrace_swapper();
- else
- set_ftrace_pid(pid);
+ trace_free_pid_list(pid_list);
}
-static int ftrace_pid_add(int p)
+static void ftrace_pid_reset(struct trace_array *tr)
{
- struct pid *pid;
- struct ftrace_pid *fpid;
- int ret = -EINVAL;
-
mutex_lock(&ftrace_lock);
-
- if (!p)
- pid = ftrace_swapper_pid;
- else
- pid = find_get_pid(p);
-
- if (!pid)
- goto out;
-
- ret = 0;
-
- list_for_each_entry(fpid, &ftrace_pids, list)
- if (fpid->pid == pid)
- goto out_put;
-
- ret = -ENOMEM;
-
- fpid = kmalloc(sizeof(*fpid), GFP_KERNEL);
- if (!fpid)
- goto out_put;
-
- list_add(&fpid->list, &ftrace_pids);
- fpid->pid = pid;
-
- set_ftrace_pid_task(pid);
+ clear_ftrace_pids(tr);
ftrace_update_pid_func();
-
ftrace_startup_all(0);
mutex_unlock(&ftrace_lock);
- return 0;
-
-out_put:
- if (pid != ftrace_swapper_pid)
- put_pid(pid);
-
-out:
- mutex_unlock(&ftrace_lock);
- return ret;
}
-static void ftrace_pid_reset(void)
-{
- struct ftrace_pid *fpid, *safe;
-
- mutex_lock(&ftrace_lock);
- list_for_each_entry_safe(fpid, safe, &ftrace_pids, list) {
- struct pid *pid = fpid->pid;
-
- clear_ftrace_pid_task(pid);
-
- list_del(&fpid->list);
- kfree(fpid);
- }
-
- ftrace_update_pid_func();
- ftrace_startup_all(0);
-
- mutex_unlock(&ftrace_lock);
-}
+/* Greater than any max PID */
+#define FTRACE_NO_PIDS (void *)(PID_MAX_LIMIT + 1)
static void *fpid_start(struct seq_file *m, loff_t *pos)
+ __acquires(RCU)
{
+ struct trace_pid_list *pid_list;
+ struct trace_array *tr = m->private;
+
mutex_lock(&ftrace_lock);
+ rcu_read_lock_sched();
- if (!ftrace_pids_enabled() && (!*pos))
- return (void *) 1;
+ pid_list = rcu_dereference_sched(tr->function_pids);
- return seq_list_start(&ftrace_pids, *pos);
+ if (!pid_list)
+ return !(*pos) ? FTRACE_NO_PIDS : NULL;
+
+ return trace_pid_start(pid_list, pos);
}
static void *fpid_next(struct seq_file *m, void *v, loff_t *pos)
{
- if (v == (void *)1)
+ struct trace_array *tr = m->private;
+ struct trace_pid_list *pid_list = rcu_dereference_sched(tr->function_pids);
+
+ if (v == FTRACE_NO_PIDS)
return NULL;
- return seq_list_next(v, &ftrace_pids, pos);
+ return trace_pid_next(pid_list, v, pos);
}
static void fpid_stop(struct seq_file *m, void *p)
+ __releases(RCU)
{
+ rcu_read_unlock_sched();
mutex_unlock(&ftrace_lock);
}
static int fpid_show(struct seq_file *m, void *v)
{
- const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list);
-
- if (v == (void *)1) {
+ if (v == FTRACE_NO_PIDS) {
seq_puts(m, "no pid\n");
return 0;
}
- if (fpid->pid == ftrace_swapper_pid)
- seq_puts(m, "swapper tasks\n");
- else
- seq_printf(m, "%u\n", pid_vnr(fpid->pid));
-
- return 0;
+ return trace_pid_show(m, v);
}
static const struct seq_operations ftrace_pid_sops = {
@@ -5509,58 +5430,103 @@ static const struct seq_operations ftrace_pid_sops = {
static int
ftrace_pid_open(struct inode *inode, struct file *file)
{
+ struct trace_array *tr = inode->i_private;
+ struct seq_file *m;
int ret = 0;
+ if (trace_array_get(tr) < 0)
+ return -ENODEV;
+
if ((file->f_mode & FMODE_WRITE) &&
(file->f_flags & O_TRUNC))
- ftrace_pid_reset();
+ ftrace_pid_reset(tr);
- if (file->f_mode & FMODE_READ)
- ret = seq_open(file, &ftrace_pid_sops);
+ ret = seq_open(file, &ftrace_pid_sops);
+ if (ret < 0) {
+ trace_array_put(tr);
+ } else {
+ m = file->private_data;
+ /* copy tr over to seq ops */
+ m->private = tr;
+ }
return ret;
}
+static void ignore_task_cpu(void *data)
+{
+ struct trace_array *tr = data;
+ struct trace_pid_list *pid_list;
+
+ /*
+ * This function is called by on_each_cpu() while the
+ * ftrace_lock is held.
+ */
+ pid_list = rcu_dereference_protected(tr->function_pids,
+ mutex_is_locked(&ftrace_lock));
+
+ this_cpu_write(tr->trace_buffer.data->ftrace_ignore_pid,
+ trace_ignore_this_task(pid_list, current));
+}
+
static ssize_t
ftrace_pid_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- char buf[64], *tmp;
- long val;
- int ret;
+ struct seq_file *m = filp->private_data;
+ struct trace_array *tr = m->private;
+ struct trace_pid_list *filtered_pids = NULL;
+ struct trace_pid_list *pid_list;
+ ssize_t ret;
- if (cnt >= sizeof(buf))
- return -EINVAL;
+ if (!cnt)
+ return 0;
+
+ mutex_lock(&ftrace_lock);
+
+ filtered_pids = rcu_dereference_protected(tr->function_pids,
+ lockdep_is_held(&ftrace_lock));
+
+ ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt);
+ if (ret < 0)
+ goto out;
- if (copy_from_user(&buf, ubuf, cnt))
- return -EFAULT;
+ rcu_assign_pointer(tr->function_pids, pid_list);
- buf[cnt] = 0;
+ if (filtered_pids) {
+ synchronize_sched();
+ trace_free_pid_list(filtered_pids);
+ } else if (pid_list) {
+ /* Register a probe to set whether to ignore the tracing of a task */
+ register_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr);
+ }
/*
- * Allow "echo > set_ftrace_pid" or "echo -n '' > set_ftrace_pid"
- * to clean the filter quietly.
+ * Ignoring of pids is done at task switch. But we have to
+ * check for those tasks that are currently running.
+ * Always do this in case a pid was appended or removed.
*/
- tmp = strstrip(buf);
- if (strlen(tmp) == 0)
- return 1;
+ on_each_cpu(ignore_task_cpu, tr, 1);
- ret = kstrtol(tmp, 10, &val);
- if (ret < 0)
- return ret;
+ ftrace_update_pid_func();
+ ftrace_startup_all(0);
+ out:
+ mutex_unlock(&ftrace_lock);
- ret = ftrace_pid_add(val);
+ if (ret > 0)
+ *ppos += ret;
- return ret ? ret : cnt;
+ return ret;
}
static int
ftrace_pid_release(struct inode *inode, struct file *file)
{
- if (file->f_mode & FMODE_READ)
- seq_release(inode, file);
+ struct trace_array *tr = inode->i_private;
- return 0;
+ trace_array_put(tr);
+
+ return seq_release(inode, file);
}
static const struct file_operations ftrace_pid_fops = {
@@ -5571,24 +5537,21 @@ static const struct file_operations ftrace_pid_fops = {
.release = ftrace_pid_release,
};
-static __init int ftrace_init_tracefs(void)
+void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer)
{
- struct dentry *d_tracer;
+ trace_create_file("set_ftrace_pid", 0644, d_tracer,
+ tr, &ftrace_pid_fops);
+}
- d_tracer = tracing_init_dentry();
- if (IS_ERR(d_tracer))
- return 0;
+void __init ftrace_init_tracefs_toplevel(struct trace_array *tr,
+ struct dentry *d_tracer)
+{
+ /* Only the top level directory has the dyn_tracefs and profile */
+ WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL));
ftrace_init_dyn_tracefs(d_tracer);
-
- trace_create_file("set_ftrace_pid", 0644, d_tracer,
- NULL, &ftrace_pid_fops);
-
ftrace_profile_tracefs(d_tracer);
-
- return 0;
}
-fs_initcall(ftrace_init_tracefs);
/**
* ftrace_kill - kill ftrace
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 77eeab277..7bc56762c 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -25,7 +25,7 @@
#include <linux/hardirq.h>
#include <linux/linkage.h>
#include <linux/uaccess.h>
-#include <linux/kprobes.h>
+#include <linux/vmalloc.h>
#include <linux/ftrace.h>
#include <linux/module.h>
#include <linux/percpu.h>
@@ -319,6 +319,258 @@ int call_filter_check_discard(struct trace_event_call *call, void *rec,
return 0;
}
+void trace_free_pid_list(struct trace_pid_list *pid_list)
+{
+ vfree(pid_list->pids);
+ kfree(pid_list);
+}
+
+/**
+ * trace_find_filtered_pid - check if a pid exists in a filtered_pid list
+ * @filtered_pids: The list of pids to check
+ * @search_pid: The PID to find in @filtered_pids
+ *
+ * Returns true if @search_pid is found in @filtered_pids, and false otherwise.
+ */
+bool
+trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid)
+{
+ /*
+ * If pid_max changed after filtered_pids was created, we
+ * by default ignore all pids greater than the previous pid_max.
+ */
+ if (search_pid >= filtered_pids->pid_max)
+ return false;
+
+ return test_bit(search_pid, filtered_pids->pids);
+}
+
+/**
+ * trace_ignore_this_task - should a task be ignored for tracing
+ * @filtered_pids: The list of pids to check
+ * @task: The task that should be ignored if not filtered
+ *
+ * Checks if @task should be traced or not from @filtered_pids.
+ * Returns true if @task should *NOT* be traced.
+ * Returns false if @task should be traced.
+ */
+bool
+trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task)
+{
+ /*
+ * Return false, because if filtered_pids does not exist,
+ * all pids are good to trace.
+ */
+ if (!filtered_pids)
+ return false;
+
+ return !trace_find_filtered_pid(filtered_pids, task->pid);
+}
+
+/**
+ * trace_filter_add_remove_task - Add or remove a task from a pid_list
+ * @pid_list: The list to modify
+ * @self: The current task for fork or NULL for exit
+ * @task: The task to add or remove
+ *
+ * If adding a task, if @self is defined, the task is only added if @self
+ * is also included in @pid_list. This happens on fork and tasks should
+ * only be added when the parent is listed. If @self is NULL, then the
+ * @task pid will be removed from the list, which would happen on exit
+ * of a task.
+ */
+void trace_filter_add_remove_task(struct trace_pid_list *pid_list,
+ struct task_struct *self,
+ struct task_struct *task)
+{
+ if (!pid_list)
+ return;
+
+ /* For forks, we only add if the forking task is listed */
+ if (self) {
+ if (!trace_find_filtered_pid(pid_list, self->pid))
+ return;
+ }
+
+ /* Sorry, but we don't support pid_max changing after setting */
+ if (task->pid >= pid_list->pid_max)
+ return;
+
+ /* "self" is set for forks, and NULL for exits */
+ if (self)
+ set_bit(task->pid, pid_list->pids);
+ else
+ clear_bit(task->pid, pid_list->pids);
+}
+
+/**
+ * trace_pid_next - Used for seq_file to get to the next pid of a pid_list
+ * @pid_list: The pid list to show
+ * @v: The last pid that was shown (+1 the actual pid to let zero be displayed)
+ * @pos: The position of the file
+ *
+ * This is used by the seq_file "next" operation to iterate the pids
+ * listed in a trace_pid_list structure.
+ *
+ * Returns the pid+1 as we want to display pid of zero, but NULL would
+ * stop the iteration.
+ */
+void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos)
+{
+ unsigned long pid = (unsigned long)v;
+
+ (*pos)++;
+
+ /* pid already is +1 of the actual previous bit */
+ pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid);
+
+ /* Return pid + 1 to allow zero to be represented */
+ if (pid < pid_list->pid_max)
+ return (void *)(pid + 1);
+
+ return NULL;
+}
+
+/**
+ * trace_pid_start - Used for seq_file to start reading pid lists
+ * @pid_list: The pid list to show
+ * @pos: The position of the file
+ *
+ * This is used by seq_file "start" operation to start the iteration
+ * of listing pids.
+ *
+ * Returns the pid+1 as we want to display pid of zero, but NULL would
+ * stop the iteration.
+ */
+void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos)
+{
+ unsigned long pid;
+ loff_t l = 0;
+
+ pid = find_first_bit(pid_list->pids, pid_list->pid_max);
+ if (pid >= pid_list->pid_max)
+ return NULL;
+
+ /* Return pid + 1 so that zero can be the exit value */
+ for (pid++; pid && l < *pos;
+ pid = (unsigned long)trace_pid_next(pid_list, (void *)pid, &l))
+ ;
+ return (void *)pid;
+}
+
+/**
+ * trace_pid_show - show the current pid in seq_file processing
+ * @m: The seq_file structure to write into
+ * @v: A void pointer of the pid (+1) value to display
+ *
+ * Can be directly used by seq_file operations to display the current
+ * pid value.
+ */
+int trace_pid_show(struct seq_file *m, void *v)
+{
+ unsigned long pid = (unsigned long)v - 1;
+
+ seq_printf(m, "%lu\n", pid);
+ return 0;
+}
+
+/* 128 should be much more than enough */
+#define PID_BUF_SIZE 127
+
+int trace_pid_write(struct trace_pid_list *filtered_pids,
+ struct trace_pid_list **new_pid_list,
+ const char __user *ubuf, size_t cnt)
+{
+ struct trace_pid_list *pid_list;
+ struct trace_parser parser;
+ unsigned long val;
+ int nr_pids = 0;
+ ssize_t read = 0;
+ ssize_t ret = 0;
+ loff_t pos;
+ pid_t pid;
+
+ if (trace_parser_get_init(&parser, PID_BUF_SIZE + 1))
+ return -ENOMEM;
+
+ /*
+ * Always recreate a new array. The write is an all or nothing
+ * operation. Always create a new array when adding new pids by
+ * the user. If the operation fails, then the current list is
+ * not modified.
+ */
+ pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL);
+ if (!pid_list)
+ return -ENOMEM;
+
+ pid_list->pid_max = READ_ONCE(pid_max);
+
+ /* Only truncating will shrink pid_max */
+ if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max)
+ pid_list->pid_max = filtered_pids->pid_max;
+
+ pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3);
+ if (!pid_list->pids) {
+ kfree(pid_list);
+ return -ENOMEM;
+ }
+
+ if (filtered_pids) {
+ /* copy the current bits to the new max */
+ for_each_set_bit(pid, filtered_pids->pids,
+ filtered_pids->pid_max) {
+ set_bit(pid, pid_list->pids);
+ nr_pids++;
+ }
+ }
+
+ while (cnt > 0) {
+
+ pos = 0;
+
+ ret = trace_get_user(&parser, ubuf, cnt, &pos);
+ if (ret < 0 || !trace_parser_loaded(&parser))
+ break;
+
+ read += ret;
+ ubuf += ret;
+ cnt -= ret;
+
+ parser.buffer[parser.idx] = 0;
+
+ ret = -EINVAL;
+ if (kstrtoul(parser.buffer, 0, &val))
+ break;
+ if (val >= pid_list->pid_max)
+ break;
+
+ pid = (pid_t)val;
+
+ set_bit(pid, pid_list->pids);
+ nr_pids++;
+
+ trace_parser_clear(&parser);
+ ret = 0;
+ }
+ trace_parser_put(&parser);
+
+ if (ret < 0) {
+ trace_free_pid_list(pid_list);
+ return ret;
+ }
+
+ if (!nr_pids) {
+ /* Cleared the list of pids */
+ trace_free_pid_list(pid_list);
+ read = ret;
+ pid_list = NULL;
+ }
+
+ *new_pid_list = pid_list;
+
+ return read;
+}
+
static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu)
{
u64 ts;
@@ -1862,7 +2114,17 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr,
{
__buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(tr, buffer, flags, 0, pc, regs);
+ /*
+ * If regs is not set, then skip the following callers:
+ * trace_buffer_unlock_commit_regs
+ * event_trigger_unlock_commit
+ * trace_event_buffer_commit
+ * trace_event_raw_event_sched_switch
+ * Note, we can still get here via blktrace, wakeup tracer
+ * and mmiotrace, but that's ok if they lose a function or
+ * two. They are not that meaningful.
+ */
+ ftrace_trace_stack(tr, buffer, flags, regs ? 0 : 4, pc, regs);
ftrace_trace_userstack(buffer, flags, pc);
}
@@ -1913,6 +2175,13 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
trace.skip = skip;
/*
+ * Add two, for this function and the call to save_stack_trace()
+ * If regs is set, then these functions will not be in the way.
+ */
+ if (!regs)
+ trace.skip += 2;
+
+ /*
* Since events can happen in NMIs there's no safe way to
* use the per cpu ftrace_stacks. We reserve it and if an interrupt
* or NMI comes in, it will just have to use the default
@@ -2083,83 +2352,41 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags)
/* created for use with alloc_percpu */
struct trace_buffer_struct {
- char buffer[TRACE_BUF_SIZE];
+ int nesting;
+ char buffer[4][TRACE_BUF_SIZE];
};
static struct trace_buffer_struct *trace_percpu_buffer;
-static struct trace_buffer_struct *trace_percpu_sirq_buffer;
-static struct trace_buffer_struct *trace_percpu_irq_buffer;
-static struct trace_buffer_struct *trace_percpu_nmi_buffer;
/*
- * The buffer used is dependent on the context. There is a per cpu
- * buffer for normal context, softirq contex, hard irq context and
- * for NMI context. Thise allows for lockless recording.
- *
- * Note, if the buffers failed to be allocated, then this returns NULL
+ * This allows for lockless recording. If we're nested too deeply, then
+ * this returns NULL.
*/
static char *get_trace_buf(void)
{
- struct trace_buffer_struct *percpu_buffer;
-
- /*
- * If we have allocated per cpu buffers, then we do not
- * need to do any locking.
- */
- if (in_nmi())
- percpu_buffer = trace_percpu_nmi_buffer;
- else if (in_irq())
- percpu_buffer = trace_percpu_irq_buffer;
- else if (in_softirq())
- percpu_buffer = trace_percpu_sirq_buffer;
- else
- percpu_buffer = trace_percpu_buffer;
+ struct trace_buffer_struct *buffer = this_cpu_ptr(trace_percpu_buffer);
- if (!percpu_buffer)
+ if (!buffer || buffer->nesting >= 4)
return NULL;
- return this_cpu_ptr(&percpu_buffer->buffer[0]);
+ return &buffer->buffer[buffer->nesting++][0];
+}
+
+static void put_trace_buf(void)
+{
+ this_cpu_dec(trace_percpu_buffer->nesting);
}
static int alloc_percpu_trace_buffer(void)
{
struct trace_buffer_struct *buffers;
- struct trace_buffer_struct *sirq_buffers;
- struct trace_buffer_struct *irq_buffers;
- struct trace_buffer_struct *nmi_buffers;
buffers = alloc_percpu(struct trace_buffer_struct);
- if (!buffers)
- goto err_warn;
-
- sirq_buffers = alloc_percpu(struct trace_buffer_struct);
- if (!sirq_buffers)
- goto err_sirq;
-
- irq_buffers = alloc_percpu(struct trace_buffer_struct);
- if (!irq_buffers)
- goto err_irq;
-
- nmi_buffers = alloc_percpu(struct trace_buffer_struct);
- if (!nmi_buffers)
- goto err_nmi;
+ if (WARN(!buffers, "Could not allocate percpu trace_printk buffer"))
+ return -ENOMEM;
trace_percpu_buffer = buffers;
- trace_percpu_sirq_buffer = sirq_buffers;
- trace_percpu_irq_buffer = irq_buffers;
- trace_percpu_nmi_buffer = nmi_buffers;
-
return 0;
-
- err_nmi:
- free_percpu(irq_buffers);
- err_irq:
- free_percpu(sirq_buffers);
- err_sirq:
- free_percpu(buffers);
- err_warn:
- WARN(1, "Could not allocate percpu trace_printk buffer");
- return -ENOMEM;
}
static int buffers_allocated;
@@ -2250,7 +2477,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
tbuffer = get_trace_buf();
if (!tbuffer) {
len = 0;
- goto out;
+ goto out_nobuffer;
}
len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args);
@@ -2276,6 +2503,9 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
}
out:
+ put_trace_buf();
+
+out_nobuffer:
preempt_enable_notrace();
unpause_graph_tracing();
@@ -2307,7 +2537,7 @@ __trace_array_vprintk(struct ring_buffer *buffer,
tbuffer = get_trace_buf();
if (!tbuffer) {
len = 0;
- goto out;
+ goto out_nobuffer;
}
len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args);
@@ -2326,7 +2556,11 @@ __trace_array_vprintk(struct ring_buffer *buffer,
__buffer_unlock_commit(buffer, event);
ftrace_trace_stack(&global_trace, buffer, flags, 6, pc, NULL);
}
- out:
+
+out:
+ put_trace_buf();
+
+out_nobuffer:
preempt_enable_notrace();
unpause_graph_tracing();
@@ -6980,6 +7214,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
for_each_tracing_cpu(cpu)
tracing_init_tracefs_percpu(tr, cpu);
+ ftrace_init_tracefs(tr, d_tracer);
}
static struct vfsmount *trace_automount(void *ingore)
@@ -7133,6 +7368,7 @@ static __init int tracer_init_tracefs(void)
return 0;
init_tracer_tracefs(&global_trace, d_tracer);
+ ftrace_init_tracefs_toplevel(&global_trace, d_tracer);
trace_create_file("tracing_thresh", 0644, d_tracer,
&global_trace, &tracing_thresh_fops);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 5167c366d..f783df416 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -80,6 +80,12 @@ enum trace_type {
FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
filter)
+#undef FTRACE_ENTRY_PACKED
+#define FTRACE_ENTRY_PACKED(name, struct_name, id, tstruct, print, \
+ filter) \
+ FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
+ filter) __packed
+
#include "trace_entries.h"
/*
@@ -156,6 +162,9 @@ struct trace_array_cpu {
char comm[TASK_COMM_LEN];
bool ignore_pid;
+#ifdef CONFIG_FUNCTION_TRACER
+ bool ftrace_ignore_pid;
+#endif
};
struct tracer;
@@ -247,6 +256,7 @@ struct trace_array {
int ref;
#ifdef CONFIG_FUNCTION_TRACER
struct ftrace_ops *ops;
+ struct trace_pid_list __rcu *function_pids;
/* function tracing enabled */
int function_enabled;
#endif
@@ -628,6 +638,25 @@ extern unsigned long nsecs_to_usecs(unsigned long nsecs);
extern unsigned long tracing_thresh;
+/* PID filtering */
+
+extern int pid_max;
+
+bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids,
+ pid_t search_pid);
+bool trace_ignore_this_task(struct trace_pid_list *filtered_pids,
+ struct task_struct *task);
+void trace_filter_add_remove_task(struct trace_pid_list *pid_list,
+ struct task_struct *self,
+ struct task_struct *task);
+void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos);
+void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos);
+int trace_pid_show(struct seq_file *m, void *v);
+void trace_free_pid_list(struct trace_pid_list *pid_list);
+int trace_pid_write(struct trace_pid_list *filtered_pids,
+ struct trace_pid_list **new_pid_list,
+ const char __user *ubuf, size_t cnt);
+
#ifdef CONFIG_TRACER_MAX_TRACE
void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
void update_max_tr_single(struct trace_array *tr,
@@ -821,12 +850,9 @@ extern struct list_head ftrace_pids;
#ifdef CONFIG_FUNCTION_TRACER
extern bool ftrace_filter_param __initdata;
-static inline int ftrace_trace_task(struct task_struct *task)
+static inline int ftrace_trace_task(struct trace_array *tr)
{
- if (list_empty(&ftrace_pids))
- return 1;
-
- return test_tsk_trace_trace(task);
+ return !this_cpu_read(tr->trace_buffer.data->ftrace_ignore_pid);
}
extern int ftrace_is_dead(void);
int ftrace_create_function_files(struct trace_array *tr,
@@ -836,8 +862,11 @@ void ftrace_init_global_array_ops(struct trace_array *tr);
void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func);
void ftrace_reset_array_ops(struct trace_array *tr);
int using_ftrace_ops_list_func(void);
+void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer);
+void ftrace_init_tracefs_toplevel(struct trace_array *tr,
+ struct dentry *d_tracer);
#else
-static inline int ftrace_trace_task(struct task_struct *task)
+static inline int ftrace_trace_task(struct trace_array *tr)
{
return 1;
}
@@ -852,6 +881,8 @@ static inline void ftrace_destroy_function_files(struct trace_array *tr) { }
static inline __init void
ftrace_init_global_array_ops(struct trace_array *tr) { }
static inline void ftrace_reset_array_ops(struct trace_array *tr) { }
+static inline void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d) { }
+static inline void ftrace_init_tracefs_toplevel(struct trace_array *tr, struct dentry *d) { }
/* ftace_func_t type is not defined, use macro instead of static inline */
#define ftrace_init_array_ops(tr, func) do { } while (0)
#endif /* CONFIG_FUNCTION_TRACER */
@@ -1600,6 +1631,11 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled);
#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \
FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \
filter)
+#undef FTRACE_ENTRY_PACKED
+#define FTRACE_ENTRY_PACKED(call, struct_name, id, tstruct, print, filter) \
+ FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \
+ filter)
+
#include "trace_entries.h"
#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_FUNCTION_TRACER)
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index ee7b94a48..5c30efcda 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -72,7 +72,7 @@ FTRACE_ENTRY_REG(function, ftrace_entry,
);
/* Function call entry */
-FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry,
+FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry,
TRACE_GRAPH_ENT,
@@ -88,7 +88,7 @@ FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry,
);
/* Function return entry */
-FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry,
+FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,
TRACE_GRAPH_RET,
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 3d4155892..03c0a48c3 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -15,7 +15,6 @@
#include <linux/kthread.h>
#include <linux/tracefs.h>
#include <linux/uaccess.h>
-#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/ctype.h>
#include <linux/sort.h>
@@ -262,6 +261,14 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer,
local_save_flags(fbuffer->flags);
fbuffer->pc = preempt_count();
+ /*
+ * If CONFIG_PREEMPT is enabled, then the tracepoint itself disables
+ * preemption (adding one to the preempt_count). Since we are
+ * interested in the preempt_count at the time the tracepoint was
+ * hit, we need to subtract one to offset the increment.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT))
+ fbuffer->pc--;
fbuffer->trace_file = trace_file;
fbuffer->event =
@@ -499,60 +506,6 @@ static void ftrace_clear_events(struct trace_array *tr)
mutex_unlock(&event_mutex);
}
-/* Shouldn't this be in a header? */
-extern int pid_max;
-
-/* Returns true if found in filter */
-static bool
-find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid)
-{
- /*
- * If pid_max changed after filtered_pids was created, we
- * by default ignore all pids greater than the previous pid_max.
- */
- if (search_pid >= filtered_pids->pid_max)
- return false;
-
- return test_bit(search_pid, filtered_pids->pids);
-}
-
-static bool
-ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task)
-{
- /*
- * Return false, because if filtered_pids does not exist,
- * all pids are good to trace.
- */
- if (!filtered_pids)
- return false;
-
- return !find_filtered_pid(filtered_pids, task->pid);
-}
-
-static void filter_add_remove_task(struct trace_pid_list *pid_list,
- struct task_struct *self,
- struct task_struct *task)
-{
- if (!pid_list)
- return;
-
- /* For forks, we only add if the forking task is listed */
- if (self) {
- if (!find_filtered_pid(pid_list, self->pid))
- return;
- }
-
- /* Sorry, but we don't support pid_max changing after setting */
- if (task->pid >= pid_list->pid_max)
- return;
-
- /* "self" is set for forks, and NULL for exits */
- if (self)
- set_bit(task->pid, pid_list->pids);
- else
- clear_bit(task->pid, pid_list->pids);
-}
-
static void
event_filter_pid_sched_process_exit(void *data, struct task_struct *task)
{
@@ -560,7 +513,7 @@ event_filter_pid_sched_process_exit(void *data, struct task_struct *task)
struct trace_array *tr = data;
pid_list = rcu_dereference_sched(tr->filtered_pids);
- filter_add_remove_task(pid_list, NULL, task);
+ trace_filter_add_remove_task(pid_list, NULL, task);
}
static void
@@ -572,7 +525,7 @@ event_filter_pid_sched_process_fork(void *data,
struct trace_array *tr = data;
pid_list = rcu_dereference_sched(tr->filtered_pids);
- filter_add_remove_task(pid_list, self, task);
+ trace_filter_add_remove_task(pid_list, self, task);
}
void trace_event_follow_fork(struct trace_array *tr, bool enable)
@@ -600,8 +553,8 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
pid_list = rcu_dereference_sched(tr->filtered_pids);
this_cpu_write(tr->trace_buffer.data->ignore_pid,
- ignore_this_task(pid_list, prev) &&
- ignore_this_task(pid_list, next));
+ trace_ignore_this_task(pid_list, prev) &&
+ trace_ignore_this_task(pid_list, next));
}
static void
@@ -614,7 +567,7 @@ event_filter_pid_sched_switch_probe_post(void *data, bool preempt,
pid_list = rcu_dereference_sched(tr->filtered_pids);
this_cpu_write(tr->trace_buffer.data->ignore_pid,
- ignore_this_task(pid_list, next));
+ trace_ignore_this_task(pid_list, next));
}
static void
@@ -630,7 +583,7 @@ event_filter_pid_sched_wakeup_probe_pre(void *data, struct task_struct *task)
pid_list = rcu_dereference_sched(tr->filtered_pids);
this_cpu_write(tr->trace_buffer.data->ignore_pid,
- ignore_this_task(pid_list, task));
+ trace_ignore_this_task(pid_list, task));
}
static void
@@ -647,7 +600,7 @@ event_filter_pid_sched_wakeup_probe_post(void *data, struct task_struct *task)
/* Set tracing if current is enabled */
this_cpu_write(tr->trace_buffer.data->ignore_pid,
- ignore_this_task(pid_list, current));
+ trace_ignore_this_task(pid_list, current));
}
static void __ftrace_clear_event_pids(struct trace_array *tr)
@@ -685,8 +638,7 @@ static void __ftrace_clear_event_pids(struct trace_array *tr)
/* Wait till all users are no longer using pid filtering */
synchronize_sched();
- vfree(pid_list->pids);
- kfree(pid_list);
+ trace_free_pid_list(pid_list);
}
static void ftrace_clear_event_pids(struct trace_array *tr)
@@ -1034,18 +986,8 @@ p_next(struct seq_file *m, void *v, loff_t *pos)
{
struct trace_array *tr = m->private;
struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids);
- unsigned long pid = (unsigned long)v;
-
- (*pos)++;
-
- /* pid already is +1 of the actual prevous bit */
- pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid);
- /* Return pid + 1 to allow zero to be represented */
- if (pid < pid_list->pid_max)
- return (void *)(pid + 1);
-
- return NULL;
+ return trace_pid_next(pid_list, v, pos);
}
static void *p_start(struct seq_file *m, loff_t *pos)
@@ -1053,8 +995,6 @@ static void *p_start(struct seq_file *m, loff_t *pos)
{
struct trace_pid_list *pid_list;
struct trace_array *tr = m->private;
- unsigned long pid;
- loff_t l = 0;
/*
* Grab the mutex, to keep calls to p_next() having the same
@@ -1070,15 +1010,7 @@ static void *p_start(struct seq_file *m, loff_t *pos)
if (!pid_list)
return NULL;
- pid = find_first_bit(pid_list->pids, pid_list->pid_max);
- if (pid >= pid_list->pid_max)
- return NULL;
-
- /* Return pid + 1 so that zero can be the exit value */
- for (pid++; pid && l < *pos;
- pid = (unsigned long)p_next(m, (void *)pid, &l))
- ;
- return (void *)pid;
+ return trace_pid_start(pid_list, pos);
}
static void p_stop(struct seq_file *m, void *p)
@@ -1088,14 +1020,6 @@ static void p_stop(struct seq_file *m, void *p)
mutex_unlock(&event_mutex);
}
-static int p_show(struct seq_file *m, void *v)
-{
- unsigned long pid = (unsigned long)v - 1;
-
- seq_printf(m, "%lu\n", pid);
- return 0;
-}
-
static ssize_t
event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
loff_t *ppos)
@@ -1654,7 +1578,7 @@ static void ignore_task_cpu(void *data)
mutex_is_locked(&event_mutex));
this_cpu_write(tr->trace_buffer.data->ignore_pid,
- ignore_this_task(pid_list, current));
+ trace_ignore_this_task(pid_list, current));
}
static ssize_t
@@ -1666,13 +1590,7 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
struct trace_pid_list *filtered_pids = NULL;
struct trace_pid_list *pid_list;
struct trace_event_file *file;
- struct trace_parser parser;
- unsigned long val;
- loff_t this_pos;
- ssize_t read = 0;
- ssize_t ret = 0;
- pid_t pid;
- int nr_pids = 0;
+ ssize_t ret;
if (!cnt)
return 0;
@@ -1681,93 +1599,15 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
if (ret < 0)
return ret;
- if (trace_parser_get_init(&parser, EVENT_BUF_SIZE + 1))
- return -ENOMEM;
-
mutex_lock(&event_mutex);
+
filtered_pids = rcu_dereference_protected(tr->filtered_pids,
lockdep_is_held(&event_mutex));
- /*
- * Always recreate a new array. The write is an all or nothing
- * operation. Always create a new array when adding new pids by
- * the user. If the operation fails, then the current list is
- * not modified.
- */
- pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL);
- if (!pid_list) {
- read = -ENOMEM;
- goto out;
- }
- pid_list->pid_max = READ_ONCE(pid_max);
- /* Only truncating will shrink pid_max */
- if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max)
- pid_list->pid_max = filtered_pids->pid_max;
- pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3);
- if (!pid_list->pids) {
- kfree(pid_list);
- read = -ENOMEM;
- goto out;
- }
- if (filtered_pids) {
- /* copy the current bits to the new max */
- pid = find_first_bit(filtered_pids->pids,
- filtered_pids->pid_max);
- while (pid < filtered_pids->pid_max) {
- set_bit(pid, pid_list->pids);
- pid = find_next_bit(filtered_pids->pids,
- filtered_pids->pid_max,
- pid + 1);
- nr_pids++;
- }
- }
-
- while (cnt > 0) {
-
- this_pos = 0;
-
- ret = trace_get_user(&parser, ubuf, cnt, &this_pos);
- if (ret < 0 || !trace_parser_loaded(&parser))
- break;
-
- read += ret;
- ubuf += ret;
- cnt -= ret;
-
- parser.buffer[parser.idx] = 0;
-
- ret = -EINVAL;
- if (kstrtoul(parser.buffer, 0, &val))
- break;
- if (val >= pid_list->pid_max)
- break;
-
- pid = (pid_t)val;
-
- set_bit(pid, pid_list->pids);
- nr_pids++;
-
- trace_parser_clear(&parser);
- ret = 0;
- }
- trace_parser_put(&parser);
-
- if (ret < 0) {
- vfree(pid_list->pids);
- kfree(pid_list);
- read = ret;
+ ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt);
+ if (ret < 0)
goto out;
- }
- if (!nr_pids) {
- /* Cleared the list of pids */
- vfree(pid_list->pids);
- kfree(pid_list);
- read = ret;
- if (!filtered_pids)
- goto out;
- pid_list = NULL;
- }
rcu_assign_pointer(tr->filtered_pids, pid_list);
list_for_each_entry(file, &tr->events, list) {
@@ -1776,10 +1616,8 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
if (filtered_pids) {
synchronize_sched();
-
- vfree(filtered_pids->pids);
- kfree(filtered_pids);
- } else {
+ trace_free_pid_list(filtered_pids);
+ } else if (pid_list) {
/*
* Register a probe that is called before all other probes
* to set ignore_pid if next or prev do not match.
@@ -1817,9 +1655,8 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
out:
mutex_unlock(&event_mutex);
- ret = read;
- if (read > 0)
- *ppos += read;
+ if (ret > 0)
+ *ppos += ret;
return ret;
}
@@ -1846,7 +1683,7 @@ static const struct seq_operations show_set_event_seq_ops = {
static const struct seq_operations show_set_pid_seq_ops = {
.start = p_start,
.next = p_next,
- .show = p_show,
+ .show = trace_pid_show,
.stop = p_stop,
};
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 0c05b8a99..f3a960ed7 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -1441,6 +1441,9 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
goto out;
}
+ if (hist_data->attrs->pause)
+ data->paused = true;
+
if (named_data) {
destroy_hist_data(data->private_data);
data->private_data = named_data->private_data;
@@ -1448,9 +1451,6 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
data->ops = &event_hist_trigger_named_ops;
}
- if (hist_data->attrs->pause)
- data->paused = true;
-
if (data->ops->init) {
ret = data->ops->init(data->ops, data);
if (ret < 0)
@@ -1500,9 +1500,9 @@ static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops,
static void hist_unreg_all(struct trace_event_file *file)
{
- struct event_trigger_data *test;
+ struct event_trigger_data *test, *n;
- list_for_each_entry_rcu(test, &file->triggers, list) {
+ list_for_each_entry_safe(test, n, &file->triggers, list) {
if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
list_del_rcu(&test->list);
trace_event_trigger_enable_disable(file, 0);
@@ -1699,9 +1699,9 @@ hist_enable_get_trigger_ops(char *cmd, char *param)
static void hist_enable_unreg_all(struct trace_event_file *file)
{
- struct event_trigger_data *test;
+ struct event_trigger_data *test, *n;
- list_for_each_entry_rcu(test, &file->triggers, list) {
+ list_for_each_entry_safe(test, n, &file->triggers, list) {
if (test->cmd_ops->trigger_type == ETT_HIST_ENABLE) {
list_del_rcu(&test->list);
update_cond_flag(file);
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 5a095c2e4..0efa00d80 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -43,7 +43,7 @@ static int allocate_ftrace_ops(struct trace_array *tr)
/* Currently only the non stack verision is supported */
ops->func = function_trace_call;
- ops->flags = FTRACE_OPS_FL_RECURSION_SAFE;
+ ops->flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_PID;
tr->ops = ops;
ops->private = tr;
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 3a0244ff7..7363ccf79 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -319,7 +319,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
int cpu;
int pc;
- if (!ftrace_trace_task(current))
+ if (!ftrace_trace_task(tr))
return 0;
/* trace it when it is-nested-in or is a function enabled. */
@@ -338,6 +338,13 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
if (ftrace_graph_notrace_addr(trace->func))
return 1;
+ /*
+ * Stop here if tracing_thresh is set. We only write function return
+ * events to the ring buffer.
+ */
+ if (tracing_thresh)
+ return 1;
+
local_irq_save(flags);
cpu = raw_smp_processor_id();
data = per_cpu_ptr(tr->trace_buffer.data, cpu);
@@ -355,14 +362,6 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
return ret;
}
-static int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
-{
- if (tracing_thresh)
- return 1;
- else
- return trace_graph_entry(trace);
-}
-
static void
__trace_graph_function(struct trace_array *tr,
unsigned long ip, unsigned long flags, int pc)
@@ -457,7 +456,7 @@ static int graph_trace_init(struct trace_array *tr)
set_graph_array(tr);
if (tracing_thresh)
ret = register_ftrace_graph(&trace_graph_thresh_return,
- &trace_graph_thresh_entry);
+ &trace_graph_entry);
else
ret = register_ftrace_graph(&trace_graph_return,
&trace_graph_entry);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 5546eec05..9aedb0b06 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -587,6 +587,7 @@ static int create_trace_kprobe(int argc, char **argv)
* $retval : fetch return value
* $stack : fetch stack address
* $stackN : fetch Nth of stack (N:0-)
+ * $comm : fetch current task comm
* @ADDR : fetch memory at ADDR (ADDR should be in kernel)
* @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol)
* %REG : fetch register REG
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 68f376ca6..cd7480d0a 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -68,19 +68,15 @@ static void mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x",
dev->bus->number, dev->devfn,
dev->vendor, dev->device, dev->irq);
- /*
- * XXX: is pci_resource_to_user() appropriate, since we are
- * supposed to interpret the __ioremap() phys_addr argument based on
- * these printed values?
- */
for (i = 0; i < 7; i++) {
- pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
+ start = dev->resource[i].start;
trace_seq_printf(s, " %llx",
(unsigned long long)(start |
(dev->resource[i].flags & PCI_REGION_FLAG_MASK)));
}
for (i = 0; i < 7; i++) {
- pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
+ start = dev->resource[i].start;
+ end = dev->resource[i].end;
trace_seq_printf(s, " %llx",
dev->resource[i].start < dev->resource[i].end ?
(unsigned long long)(end - start) + 1 : 0);
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 1d372fa6f..74e80a582 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -218,6 +218,28 @@ free_bitfield_fetch_param(struct bitfield_fetch_param *data)
kfree(data);
}
+void FETCH_FUNC_NAME(comm, string)(struct pt_regs *regs,
+ void *data, void *dest)
+{
+ int maxlen = get_rloc_len(*(u32 *)dest);
+ u8 *dst = get_rloc_data(dest);
+ long ret;
+
+ if (!maxlen)
+ return;
+
+ ret = strlcpy(dst, current->comm, maxlen);
+ *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(*(u32 *)dest));
+}
+NOKPROBE_SYMBOL(FETCH_FUNC_NAME(comm, string));
+
+void FETCH_FUNC_NAME(comm, string_size)(struct pt_regs *regs,
+ void *data, void *dest)
+{
+ *(u32 *)dest = strlen(current->comm) + 1;
+}
+NOKPROBE_SYMBOL(FETCH_FUNC_NAME(comm, string_size));
+
static const struct fetch_type *find_fetch_type(const char *type,
const struct fetch_type *ftbl)
{
@@ -348,6 +370,11 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
}
} else
ret = -EINVAL;
+ } else if (strcmp(arg, "comm") == 0) {
+ if (strcmp(t->name, "string") != 0 &&
+ strcmp(t->name, "string_size") != 0)
+ return -EINVAL;
+ f->fn = t->fetch[FETCH_MTD_comm];
} else
ret = -EINVAL;
@@ -522,6 +549,12 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
arg[t - parg->comm] = '\0';
t++;
}
+ /*
+ * The default type of $comm should be "string", and it can't be
+ * dereferenced.
+ */
+ if (!t && strcmp(arg, "$comm") == 0)
+ t = "string";
parg->type = find_fetch_type(t, ftbl);
if (!parg->type) {
pr_info("Unsupported type: %s\n", t);
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index f6398db09..45400ca5d 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -102,6 +102,7 @@ enum {
FETCH_MTD_reg = 0,
FETCH_MTD_stack,
FETCH_MTD_retval,
+ FETCH_MTD_comm,
FETCH_MTD_memory,
FETCH_MTD_symbol,
FETCH_MTD_deref,
@@ -183,6 +184,14 @@ DECLARE_BASIC_FETCH_FUNCS(bitfield);
#define fetch_bitfield_string NULL
#define fetch_bitfield_string_size NULL
+/* comm only makes sense as a string */
+#define fetch_comm_u8 NULL
+#define fetch_comm_u16 NULL
+#define fetch_comm_u32 NULL
+#define fetch_comm_u64 NULL
+DECLARE_FETCH_FUNC(comm, string);
+DECLARE_FETCH_FUNC(comm, string_size);
+
/*
* Define macro for basic types - we don't need to define s* types, because
* we have to care only about bitwidth at recording time.
@@ -213,6 +222,7 @@ DEFINE_FETCH_##method(u64)
ASSIGN_FETCH_FUNC(reg, ftype), \
ASSIGN_FETCH_FUNC(stack, ftype), \
ASSIGN_FETCH_FUNC(retval, ftype), \
+ASSIGN_FETCH_FUNC(comm, ftype), \
ASSIGN_FETCH_FUNC(memory, ftype), \
ASSIGN_FETCH_FUNC(symbol, ftype), \
ASSIGN_FETCH_FUNC(deref, ftype), \
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 9bafc2119..68f594212 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -938,6 +938,20 @@ bool userns_may_setgroups(const struct user_namespace *ns)
return allowed;
}
+/*
+ * Returns true if the current user namespace is the same namespace as or a
+ * descendant of @target_ns.
+ */
+bool current_in_userns(const struct user_namespace *target_ns)
+{
+ struct user_namespace *ns;
+ for (ns = current_user_ns(); ns; ns = ns->parent) {
+ if (ns == target_ns)
+ return true;
+ }
+ return false;
+}
+
static inline struct user_namespace *to_user_ns(struct ns_common *ns)
{
return container_of(ns, struct user_namespace, ns);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 97e7b793d..ef071ca73 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4369,8 +4369,8 @@ static void show_pwq(struct pool_workqueue *pwq)
/**
* show_workqueue_state - dump workqueue state
*
- * Called from a sysrq handler and prints out all busy workqueues and
- * pools.
+ * Called from a sysrq handler or try_to_freeze_tasks() and prints out
+ * all busy workqueues and pools.
*/
void show_workqueue_state(void)
{
@@ -4607,84 +4607,65 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, &cpumask) < 0);
}
-/*
- * Workqueues should be brought up before normal priority CPU notifiers.
- * This will be registered high priority CPU notifier.
- */
-static int workqueue_cpu_up_callback(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
+int workqueue_prepare_cpu(unsigned int cpu)
+{
+ struct worker_pool *pool;
+
+ for_each_cpu_worker_pool(pool, cpu) {
+ if (pool->nr_workers)
+ continue;
+ if (!create_worker(pool))
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+int workqueue_online_cpu(unsigned int cpu)
{
- int cpu = (unsigned long)hcpu;
struct worker_pool *pool;
struct workqueue_struct *wq;
int pi;
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_UP_PREPARE:
- for_each_cpu_worker_pool(pool, cpu) {
- if (pool->nr_workers)
- continue;
- if (!create_worker(pool))
- return NOTIFY_BAD;
- }
- break;
-
- case CPU_DOWN_FAILED:
- case CPU_ONLINE:
- mutex_lock(&wq_pool_mutex);
+ mutex_lock(&wq_pool_mutex);
- for_each_pool(pool, pi) {
- mutex_lock(&pool->attach_mutex);
+ for_each_pool(pool, pi) {
+ mutex_lock(&pool->attach_mutex);
- if (pool->cpu == cpu)
- rebind_workers(pool);
- else if (pool->cpu < 0)
- restore_unbound_workers_cpumask(pool, cpu);
+ if (pool->cpu == cpu)
+ rebind_workers(pool);
+ else if (pool->cpu < 0)
+ restore_unbound_workers_cpumask(pool, cpu);
- mutex_unlock(&pool->attach_mutex);
- }
+ mutex_unlock(&pool->attach_mutex);
+ }
- /* update NUMA affinity of unbound workqueues */
- list_for_each_entry(wq, &workqueues, list)
- wq_update_unbound_numa(wq, cpu, true);
+ /* update NUMA affinity of unbound workqueues */
+ list_for_each_entry(wq, &workqueues, list)
+ wq_update_unbound_numa(wq, cpu, true);
- mutex_unlock(&wq_pool_mutex);
- break;
- }
- return NOTIFY_OK;
+ mutex_unlock(&wq_pool_mutex);
+ return 0;
}
-/*
- * Workqueues should be brought down after normal priority CPU notifiers.
- * This will be registered as low priority CPU notifier.
- */
-static int workqueue_cpu_down_callback(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
+int workqueue_offline_cpu(unsigned int cpu)
{
- int cpu = (unsigned long)hcpu;
struct work_struct unbind_work;
struct workqueue_struct *wq;
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_DOWN_PREPARE:
- /* unbinding per-cpu workers should happen on the local CPU */
- INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn);
- queue_work_on(cpu, system_highpri_wq, &unbind_work);
-
- /* update NUMA affinity of unbound workqueues */
- mutex_lock(&wq_pool_mutex);
- list_for_each_entry(wq, &workqueues, list)
- wq_update_unbound_numa(wq, cpu, false);
- mutex_unlock(&wq_pool_mutex);
-
- /* wait for per-cpu unbinding to finish */
- flush_work(&unbind_work);
- destroy_work_on_stack(&unbind_work);
- break;
- }
- return NOTIFY_OK;
+ /* unbinding per-cpu workers should happen on the local CPU */
+ INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn);
+ queue_work_on(cpu, system_highpri_wq, &unbind_work);
+
+ /* update NUMA affinity of unbound workqueues */
+ mutex_lock(&wq_pool_mutex);
+ list_for_each_entry(wq, &workqueues, list)
+ wq_update_unbound_numa(wq, cpu, false);
+ mutex_unlock(&wq_pool_mutex);
+
+ /* wait for per-cpu unbinding to finish */
+ flush_work(&unbind_work);
+ destroy_work_on_stack(&unbind_work);
+ return 0;
}
#ifdef CONFIG_SMP
@@ -5486,9 +5467,6 @@ static int __init init_workqueues(void)
pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
- cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
- hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
-
wq_numa_init();
/* initialize CPU pools */