diff options
Diffstat (limited to 'mm/oom_kill.c')
-rw-r--r-- | mm/oom_kill.c | 172 |
1 files changed, 123 insertions, 49 deletions
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 86349586e..ddf74487f 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -174,8 +174,13 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, if (!p) return 0; + /* + * Do not even consider tasks which are explicitly marked oom + * unkillable or have been already oom reaped. + */ adj = (long)p->signal->oom_score_adj; - if (adj == OOM_SCORE_ADJ_MIN) { + if (adj == OOM_SCORE_ADJ_MIN || + test_bit(MMF_OOM_REAPED, &p->mm->flags)) { task_unlock(p); return 0; } @@ -278,12 +283,8 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, * This task already has access to memory reserves and is being killed. * Don't allow any other task to have access to the reserves. */ - if (test_tsk_thread_flag(task, TIF_MEMDIE)) { - if (!is_sysrq_oom(oc)) - return OOM_SCAN_ABORT; - } - if (!task->mm) - return OOM_SCAN_CONTINUE; + if (!is_sysrq_oom(oc) && atomic_read(&task->signal->oom_victims)) + return OOM_SCAN_ABORT; /* * If task is allocating a lot of memory and has been marked to be @@ -302,12 +303,12 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, static struct task_struct *select_bad_process(struct oom_control *oc, unsigned int *ppoints, unsigned long totalpages) { - struct task_struct *g, *p; + struct task_struct *p; struct task_struct *chosen = NULL; unsigned long chosen_points = 0; rcu_read_lock(); - for_each_process_thread(g, p) { + for_each_process(p) { unsigned int points; switch (oom_scan_process_thread(oc, p, totalpages)) { @@ -326,9 +327,6 @@ static struct task_struct *select_bad_process(struct oom_control *oc, points = oom_badness(p, NULL, oc->nodemask, totalpages); if (!points || points < chosen_points) continue; - /* Prefer thread group leaders for display purposes */ - if (points == chosen_points && thread_group_leader(chosen)) - continue; chosen = p; chosen_points = points; @@ -412,6 +410,25 @@ bool oom_killer_disabled __read_mostly; #define K(x) ((x) << (PAGE_SHIFT-10)) +/* + * task->mm can be NULL if the task is the exited group leader. So to + * determine whether the task is using a particular mm, we examine all the + * task's threads: if one of those is using this mm then this task was also + * using it. + */ +static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm) +{ + struct task_struct *t; + + for_each_thread(p, t) { + struct mm_struct *t_mm = READ_ONCE(t->mm); + if (t_mm) + return t_mm == mm; + } + return false; +} + + #ifdef CONFIG_MMU /* * OOM Reaper kernel thread which tries to reap the memory used by the OOM @@ -422,18 +439,33 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); static struct task_struct *oom_reaper_list; static DEFINE_SPINLOCK(oom_reaper_lock); - static bool __oom_reap_task(struct task_struct *tsk) { struct mmu_gather tlb; struct vm_area_struct *vma; - struct mm_struct *mm; + struct mm_struct *mm = NULL; struct task_struct *p; struct zap_details details = {.check_swap_entries = true, .ignore_dirty = true}; bool ret = true; /* + * We have to make sure to not race with the victim exit path + * and cause premature new oom victim selection: + * __oom_reap_task exit_mm + * atomic_inc_not_zero + * mmput + * atomic_dec_and_test + * exit_oom_victim + * [...] + * out_of_memory + * select_bad_process + * # no TIF_MEMDIE task selects new victim + * unmap_page_range # frees some memory + */ + mutex_lock(&oom_lock); + + /* * Make sure we find the associated mm_struct even when the particular * thread has already terminated and cleared its mm. * We might have race with exit path so consider our work done if there @@ -441,19 +473,14 @@ static bool __oom_reap_task(struct task_struct *tsk) */ p = find_lock_task_mm(tsk); if (!p) - return true; - + goto unlock_oom; mm = p->mm; - if (!atomic_inc_not_zero(&mm->mm_users)) { - task_unlock(p); - return true; - } - + atomic_inc(&mm->mm_users); task_unlock(p); if (!down_read_trylock(&mm->mmap_sem)) { ret = false; - goto out; + goto unlock_oom; } tlb_gather_mmu(&tlb, mm, 0, -1); @@ -491,16 +518,19 @@ static bool __oom_reap_task(struct task_struct *tsk) up_read(&mm->mmap_sem); /* - * Clear TIF_MEMDIE because the task shouldn't be sitting on a - * reasonably reclaimable memory anymore. OOM killer can continue - * by selecting other victim if unmapping hasn't led to any - * improvements. This also means that selecting this task doesn't - * make any sense. + * This task can be safely ignored because we cannot do much more + * to release its memory. */ - tsk->signal->oom_score_adj = OOM_SCORE_ADJ_MIN; - exit_oom_victim(tsk); -out: - mmput(mm); + set_bit(MMF_OOM_REAPED, &mm->flags); +unlock_oom: + mutex_unlock(&oom_lock); + /* + * Drop our reference but make sure the mmput slow path is called from a + * different context because we shouldn't risk we get stuck there and + * put the oom_reaper out of the way. + */ + if (mm) + mmput_async(mm); return ret; } @@ -519,6 +549,15 @@ static void oom_reap_task(struct task_struct *tsk) debug_show_all_locks(); } + /* + * Clear TIF_MEMDIE because the task shouldn't be sitting on a + * reasonably reclaimable memory anymore or it is not a good candidate + * for the oom victim right now because it cannot release its memory + * itself nor by the oom reaper. + */ + tsk->oom_reaper_list = NULL; + exit_oom_victim(tsk); + /* Drop a reference taken by wake_oom_reaper */ put_task_struct(tsk); } @@ -563,6 +602,46 @@ static void wake_oom_reaper(struct task_struct *tsk) wake_up(&oom_reaper_wait); } +/* Check if we can reap the given task. This has to be called with stable + * tsk->mm + */ +void try_oom_reaper(struct task_struct *tsk) +{ + struct mm_struct *mm = tsk->mm; + struct task_struct *p; + + if (!mm) + return; + + /* + * There might be other threads/processes which are either not + * dying or even not killable. + */ + if (atomic_read(&mm->mm_users) > 1) { + rcu_read_lock(); + for_each_process(p) { + if (!process_shares_mm(p, mm)) + continue; + if (fatal_signal_pending(p)) + continue; + + /* + * If the task is exiting make sure the whole thread group + * is exiting and cannot acces mm anymore. + */ + if (signal_group_exit(p->signal)) + continue; + + /* Give up */ + rcu_read_unlock(); + return; + } + rcu_read_unlock(); + } + + wake_oom_reaper(tsk); +} + static int __init oom_init(void) { oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper"); @@ -593,6 +672,7 @@ void mark_oom_victim(struct task_struct *tsk) /* OOM killer might race with memcg OOM */ if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE)) return; + atomic_inc(&tsk->signal->oom_victims); /* * Make sure that the task is woken up from uninterruptible sleep * if it is frozen because OOM killer wouldn't be able to free @@ -610,6 +690,7 @@ void exit_oom_victim(struct task_struct *tsk) { if (!test_and_clear_tsk_thread_flag(tsk, TIF_MEMDIE)) return; + atomic_dec(&tsk->signal->oom_victims); if (!atomic_dec_return(&oom_victims)) wake_up_all(&oom_victims_wait); @@ -653,24 +734,6 @@ void oom_killer_enable(void) } /* - * task->mm can be NULL if the task is the exited group leader. So to - * determine whether the task is using a particular mm, we examine all the - * task's threads: if one of those is using this mm then this task was also - * using it. - */ -static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm) -{ - struct task_struct *t; - - for_each_thread(p, t) { - struct mm_struct *t_mm = READ_ONCE(t->mm); - if (t_mm) - return t_mm == mm; - } - return false; -} - -/* * Must be called while holding a reference to p, which will be released upon * returning. */ @@ -694,6 +757,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, task_lock(p); if (p->mm && task_will_free_mem(p)) { mark_oom_victim(p); + try_oom_reaper(p); task_unlock(p); put_task_struct(p); return; @@ -873,10 +937,20 @@ bool out_of_memory(struct oom_control *oc) if (current->mm && (fatal_signal_pending(current) || task_will_free_mem(current))) { mark_oom_victim(current); + try_oom_reaper(current); return true; } /* + * The OOM killer does not compensate for IO-less reclaim. + * pagefault_out_of_memory lost its gfp context so we have to + * make sure exclude 0 mask - all other users should have at least + * ___GFP_DIRECT_RECLAIM to get here. + */ + if (oc->gfp_mask && !(oc->gfp_mask & (__GFP_FS|__GFP_NOFAIL))) + return true; + + /* * Check if there were limitations on the allocation (only relevant for * NUMA) that may require different handling. */ |