summary refs log tree commit diff
path: root/mm/oom_kill.c
diff options
context:
space:
mode:
author André Fabian Silva Delgado <emulatorman@parabola.nu> 2016-10-20 00:10:27 -0300
committer André Fabian Silva Delgado <emulatorman@parabola.nu> 2016-10-20 00:10:27 -0300
commit d0b2f91bede3bd5e3d24dd6803e56eee959c1797 (patch)
tree 7fee4ab0509879c373c4f2cbd5b8a5be5b4041ee /mm/oom_kill.c
parent e914f8eb445e8f74b00303c19c2ffceaedd16a05 (diff)
Linux-libre 4.8.2-gnu (tag: pck-4.8.2-gnu)
Diffstat (limited to 'mm/oom_kill.c')
-rw-r--r-- mm/oom_kill.c 252
1 files changed, 162 insertions, 90 deletions
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index ddf74487f..d53a9aa00 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -176,11 +176,13 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
/*
* Do not even consider tasks which are explicitly marked oom
- * unkillable or have been already oom reaped.
+ * unkillable or have been already oom reaped or they are in
+ * the middle of vfork
*/
adj = (long)p->signal->oom_score_adj;
if (adj == OOM_SCORE_ADJ_MIN ||
- test_bit(MMF_OOM_REAPED, &p->mm->flags)) {
+ test_bit(MMF_OOM_REAPED, &p->mm->flags) ||
+ in_vfork(p)) {
task_unlock(p);
return 0;
}
@@ -274,17 +276,29 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc,
#endif
enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
- struct task_struct *task, unsigned long totalpages)
+ struct task_struct *task)
{
if (oom_unkillable_task(task, NULL, oc->nodemask))
return OOM_SCAN_CONTINUE;
/*
* This task already has access to memory reserves and is being killed.
- * Don't allow any other task to have access to the reserves.
+ * Don't allow any other task to have access to the reserves unless
+ * the task has MMF_OOM_REAPED because the chance that it would release
+ * any memory is quite low.
*/
- if (!is_sysrq_oom(oc) && atomic_read(&task->signal->oom_victims))
- return OOM_SCAN_ABORT;
+ if (!is_sysrq_oom(oc) && atomic_read(&task->signal->oom_victims)) {
+ struct task_struct *p = find_lock_task_mm(task);
+ enum oom_scan_t ret = OOM_SCAN_ABORT;
+
+ if (p) {
+ if (test_bit(MMF_OOM_REAPED, &p->mm->flags))
+ ret = OOM_SCAN_CONTINUE;
+ task_unlock(p);
+ }
+
+ return ret;
+ }
/*
* If task is allocating a lot of memory and has been marked to be
@@ -311,7 +325,7 @@ static struct task_struct *select_bad_process(struct oom_control *oc,
for_each_process(p) {
unsigned int points;
- switch (oom_scan_process_thread(oc, p, totalpages)) {
+ switch (oom_scan_process_thread(oc, p)) {
case OOM_SCAN_SELECT:
chosen = p;
chosen_points = ULONG_MAX;
@@ -383,8 +397,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
rcu_read_unlock();
}
-static void dump_header(struct oom_control *oc, struct task_struct *p,
- struct mem_cgroup *memcg)
+static void dump_header(struct oom_control *oc, struct task_struct *p)
{
pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
@@ -392,12 +405,12 @@ static void dump_header(struct oom_control *oc, struct task_struct *p,
cpuset_print_current_mems_allowed();
dump_stack();
- if (memcg)
- mem_cgroup_print_oom_info(memcg, p);
+ if (oc->memcg)
+ mem_cgroup_print_oom_info(oc->memcg, p);
else
show_mem(SHOW_MEM_FILTER_NODES);
if (sysctl_oom_dump_tasks)
- dump_tasks(memcg, oc->nodemask);
+ dump_tasks(oc->memcg, oc->nodemask);
}
/*
@@ -416,7 +429,7 @@ bool oom_killer_disabled __read_mostly;
* task's threads: if one of those is using this mm then this task was also
* using it.
*/
-static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
+bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
{
struct task_struct *t;
@@ -453,7 +466,7 @@ static bool __oom_reap_task(struct task_struct *tsk)
* We have to make sure to not race with the victim exit path
* and cause premature new oom victim selection:
* __oom_reap_task exit_mm
- * atomic_inc_not_zero
+ * mmget_not_zero
* mmput
* atomic_dec_and_test
* exit_oom_victim
@@ -475,12 +488,22 @@ static bool __oom_reap_task(struct task_struct *tsk)
if (!p)
goto unlock_oom;
mm = p->mm;
- atomic_inc(&mm->mm_users);
+ atomic_inc(&mm->mm_count);
task_unlock(p);
if (!down_read_trylock(&mm->mmap_sem)) {
ret = false;
- goto unlock_oom;
+ goto mm_drop;
+ }
+
+ /*
+ * increase mm_users only after we know we will reap something so
+ * that the mmput_async is called only when we have reaped something
+ * and delayed __mmput doesn't matter that much
+ */
+ if (!mmget_not_zero(mm)) {
+ up_read(&mm->mmap_sem);
+ goto mm_drop;
}
tlb_gather_mmu(&tlb, mm, 0, -1);
@@ -522,15 +545,16 @@ static bool __oom_reap_task(struct task_struct *tsk)
* to release its memory.
*/
set_bit(MMF_OOM_REAPED, &mm->flags);
-unlock_oom:
- mutex_unlock(&oom_lock);
/*
* Drop our reference but make sure the mmput slow path is called from a
* different context because we shouldn't risk we get stuck there and
* put the oom_reaper out of the way.
*/
- if (mm)
- mmput_async(mm);
+ mmput_async(mm);
+mm_drop:
+ mmdrop(mm);
+unlock_oom:
+ mutex_unlock(&oom_lock);
return ret;
}
@@ -544,8 +568,27 @@ static void oom_reap_task(struct task_struct *tsk)
schedule_timeout_idle(HZ/10);
if (attempts > MAX_OOM_REAP_RETRIES) {
+ struct task_struct *p;
+
pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
task_pid_nr(tsk), tsk->comm);
+
+ /*
+ * If we've already tried to reap this task in the past and
+ * failed it probably doesn't make much sense to try yet again
+ * so hide the mm from the oom killer so that it can move on
+ * to another task with a different mm struct.
+ */
+ p = find_lock_task_mm(tsk);
+ if (p) {
+ if (test_and_set_bit(MMF_OOM_NOT_REAPABLE, &p->mm->flags)) {
+ pr_info("oom_reaper: giving up pid:%d (%s)\n",
+ task_pid_nr(tsk), tsk->comm);
+ set_bit(MMF_OOM_REAPED, &p->mm->flags);
+ }
+ task_unlock(p);
+ }
+
debug_show_all_locks();
}
@@ -584,7 +627,7 @@ static int oom_reaper(void *unused)
return 0;
}
-static void wake_oom_reaper(struct task_struct *tsk)
+void wake_oom_reaper(struct task_struct *tsk)
{
if (!oom_reaper_th)
return;
@@ -602,46 +645,6 @@ static void wake_oom_reaper(struct task_struct *tsk)
wake_up(&oom_reaper_wait);
}
-/* Check if we can reap the given task. This has to be called with stable
- * tsk->mm
- */
-void try_oom_reaper(struct task_struct *tsk)
-{
- struct mm_struct *mm = tsk->mm;
- struct task_struct *p;
-
- if (!mm)
- return;
-
- /*
- * There might be other threads/processes which are either not
- * dying or even not killable.
- */
- if (atomic_read(&mm->mm_users) > 1) {
- rcu_read_lock();
- for_each_process(p) {
- if (!process_shares_mm(p, mm))
- continue;
- if (fatal_signal_pending(p))
- continue;
-
- /*
- * If the task is exiting make sure the whole thread group
- * is exiting and cannot acces mm anymore.
- */
- if (signal_group_exit(p->signal))
- continue;
-
- /* Give up */
- rcu_read_unlock();
- return;
- }
- rcu_read_unlock();
- }
-
- wake_oom_reaper(tsk);
-}
-
static int __init oom_init(void)
{
oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
@@ -653,10 +656,6 @@ static int __init oom_init(void)
return 0;
}
subsys_initcall(oom_init)
-#else
-static void wake_oom_reaper(struct task_struct *tsk)
-{
-}
#endif
/**
@@ -733,13 +732,87 @@ void oom_killer_enable(void)
oom_killer_disabled = false;
}
+static inline bool __task_will_free_mem(struct task_struct *task)
+{
+ struct signal_struct *sig = task->signal;
+
+ /*
+ * A coredumping process may sleep for an extended period in exit_mm(),
+ * so the oom killer cannot assume that the process will promptly exit
+ * and release memory.
+ */
+ if (sig->flags & SIGNAL_GROUP_COREDUMP)
+ return false;
+
+ if (sig->flags & SIGNAL_GROUP_EXIT)
+ return true;
+
+ if (thread_group_empty(task) && (task->flags & PF_EXITING))
+ return true;
+
+ return false;
+}
+
+/*
+ * Checks whether the given task is dying or exiting and likely to
+ * release its address space. This means that all threads and processes
+ * sharing the same mm have to be killed or exiting.
+ * Caller has to make sure that task->mm is stable (hold task_lock or
+ * it operates on the current).
+ */
+bool task_will_free_mem(struct task_struct *task)
+{
+ struct mm_struct *mm = task->mm;
+ struct task_struct *p;
+ bool ret = true;
+
+ /*
+ * Skip a task without mm because it might have passed its exit_mm and
+ * exit_oom_victim. oom_reaper could have rescued that but do not rely
+ * on that for now. We can consider find_lock_task_mm in future.
+ */
+ if (!mm)
+ return false;
+
+ if (!__task_will_free_mem(task))
+ return false;
+
+ /*
+ * This task has already been drained by the oom reaper so there is
+ * only a small chance it will free some more
+ */
+ if (test_bit(MMF_OOM_REAPED, &mm->flags))
+ return false;
+
+ if (atomic_read(&mm->mm_users) <= 1)
+ return true;
+
+ /*
+ * This is really pessimistic but we do not have any reliable way
+ * to check whether external processes share our mm
+ */
+ rcu_read_lock();
+ for_each_process(p) {
+ if (!process_shares_mm(p, mm))
+ continue;
+ if (same_thread_group(task, p))
+ continue;
+ ret = __task_will_free_mem(p);
+ if (!ret)
+ break;
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
/*
* Must be called while holding a reference to p, which will be released upon
* returning.
*/
void oom_kill_process(struct oom_control *oc, struct task_struct *p,
unsigned int points, unsigned long totalpages,
- struct mem_cgroup *memcg, const char *message)
+ const char *message)
{
struct task_struct *victim = p;
struct task_struct *child;
@@ -755,9 +828,9 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
* its children or threads, just set TIF_MEMDIE so it can die quickly
*/
task_lock(p);
- if (p->mm && task_will_free_mem(p)) {
+ if (task_will_free_mem(p)) {
mark_oom_victim(p);
- try_oom_reaper(p);
+ wake_oom_reaper(p);
task_unlock(p);
put_task_struct(p);
return;
@@ -765,7 +838,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
task_unlock(p);
if (__ratelimit(&oom_rs))
- dump_header(oc, p, memcg);
+ dump_header(oc, p);
pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
message, task_pid_nr(p), p->comm, points);
@@ -786,8 +859,8 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
/*
* oom_badness() returns 0 if the thread is unkillable
*/
- child_points = oom_badness(child, memcg, oc->nodemask,
- totalpages);
+ child_points = oom_badness(child,
+ oc->memcg, oc->nodemask, totalpages);
if (child_points > victim_points) {
put_task_struct(victim);
victim = child;
@@ -840,14 +913,18 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
continue;
if (same_thread_group(p, victim))
continue;
- if (unlikely(p->flags & PF_KTHREAD) || is_global_init(p) ||
- p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
+ if (unlikely(p->flags & PF_KTHREAD) || is_global_init(p)) {
/*
* We cannot use oom_reaper for the mm shared by this
* process because it wouldn't get killed and so the
- * memory might be still used.
+ * memory might be still used. Hide the mm from the oom
+ * killer to guarantee OOM forward progress.
*/
can_oom_reap = false;
+ set_bit(MMF_OOM_REAPED, &mm->flags);
+ pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n",
+ task_pid_nr(victim), victim->comm,
+ task_pid_nr(p), p->comm);
continue;
}
do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
@@ -865,8 +942,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
/*
* Determines whether the kernel must panic because of the panic_on_oom sysctl.
*/
-void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint,
- struct mem_cgroup *memcg)
+void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint)
{
if (likely(!sysctl_panic_on_oom))
return;
@@ -882,7 +958,7 @@ void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint,
/* Do not panic for oom kills triggered by sysrq */
if (is_sysrq_oom(oc))
return;
- dump_header(oc, NULL, memcg);
+ dump_header(oc, NULL);
panic("Out of memory: %s panic_on_oom is enabled\n",
sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
}
@@ -930,14 +1006,10 @@ bool out_of_memory(struct oom_control *oc)
* If current has a pending SIGKILL or is exiting, then automatically
* select it. The goal is to allow it to allocate so that it may
* quickly exit and free its memory.
- *
- * But don't select if current has already released its mm and cleared
- * TIF_MEMDIE flag at exit_mm(), otherwise an OOM livelock may occur.
*/
- if (current->mm &&
- (fatal_signal_pending(current) || task_will_free_mem(current))) {
+ if (task_will_free_mem(current)) {
mark_oom_victim(current);
- try_oom_reaper(current);
+ wake_oom_reaper(current);
return true;
}
@@ -957,13 +1029,13 @@ bool out_of_memory(struct oom_control *oc)
constraint = constrained_alloc(oc, &totalpages);
if (constraint != CONSTRAINT_MEMORY_POLICY)
oc->nodemask = NULL;
- check_panic_on_oom(oc, constraint, NULL);
+ check_panic_on_oom(oc, constraint);
if (sysctl_oom_kill_allocating_task && current->mm &&
!oom_unkillable_task(current, NULL, oc->nodemask) &&
current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
get_task_struct(current);
- oom_kill_process(oc, current, 0, totalpages, NULL,
+ oom_kill_process(oc, current, 0, totalpages,
"Out of memory (oom_kill_allocating_task)");
return true;
}
@@ -971,12 +1043,11 @@ bool out_of_memory(struct oom_control *oc)
p = select_bad_process(oc, &points, totalpages);
/* Found nothing?!?! Either we hang forever, or we panic. */
if (!p && !is_sysrq_oom(oc)) {
- dump_header(oc, NULL, NULL);
+ dump_header(oc, NULL);
panic("Out of memory and no killable processes...\n");
}
if (p && p != (void *)-1UL) {
- oom_kill_process(oc, p, points, totalpages, NULL,
- "Out of memory");
+ oom_kill_process(oc, p, points, totalpages, "Out of memory");
/*
* Give the killed process a good chance to exit before trying
* to allocate memory again.
@@ -988,14 +1059,15 @@ bool out_of_memory(struct oom_control *oc)
/*
* The pagefault handler calls here because it is out of memory, so kill a
- * memory-hogging task. If any populated zone has ZONE_OOM_LOCKED set, a
- * parallel oom killing is already in progress so do nothing.
+ * memory-hogging task. If oom_lock is held by somebody else, a parallel oom
+ * killing is already in progress so do nothing.
*/
void pagefault_out_of_memory(void)
{
struct oom_control oc = {
.zonelist = NULL,
.nodemask = NULL,
+ .memcg = NULL,
.gfp_mask = 0,
.order = 0,
};