diff options
author | André Fabian Silva Delgado <emulatorman@parabola.nu> | 2015-12-15 14:52:16 -0300 |
---|---|---|
committer | André Fabian Silva Delgado <emulatorman@parabola.nu> | 2015-12-15 14:52:16 -0300 |
commit | 8d91c1e411f55d7ea91b1183a2e9f8088fb4d5be (patch) | |
tree | e9891aa6c295060d065adffd610c4f49ecf884f3 /fs/proc | |
parent | a71852147516bc1cb5b0b3cbd13639bfd4022dc8 (diff) |
Linux-libre 4.3.2-gnu
Diffstat (limited to 'fs/proc')
-rw-r--r-- | fs/proc/array.c | 21 | ||||
-rw-r--r-- | fs/proc/base.c | 124 | ||||
-rw-r--r-- | fs/proc/generic.c | 44 | ||||
-rw-r--r-- | fs/proc/page.c | 65 | ||||
-rw-r--r-- | fs/proc/root.c | 2 | ||||
-rw-r--r-- | fs/proc/task_mmu.c | 292 |
6 files changed, 295 insertions, 253 deletions
diff --git a/fs/proc/array.c b/fs/proc/array.c index ce065cf31..eed2050db 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -308,7 +308,8 @@ static void render_cap_t(struct seq_file *m, const char *header, static inline void task_cap(struct seq_file *m, struct task_struct *p) { const struct cred *cred; - kernel_cap_t cap_inheritable, cap_permitted, cap_effective, cap_bset; + kernel_cap_t cap_inheritable, cap_permitted, cap_effective, + cap_bset, cap_ambient; rcu_read_lock(); cred = __task_cred(p); @@ -316,12 +317,14 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p) cap_permitted = cred->cap_permitted; cap_effective = cred->cap_effective; cap_bset = cred->cap_bset; + cap_ambient = cred->cap_ambient; rcu_read_unlock(); render_cap_t(m, "CapInh:\t", &cap_inheritable); render_cap_t(m, "CapPrm:\t", &cap_permitted); render_cap_t(m, "CapEff:\t", &cap_effective); render_cap_t(m, "CapBnd:\t", &cap_bset); + render_cap_t(m, "CapAmb:\t", &cap_ambient); } static inline void task_seccomp(struct seq_file *m, struct task_struct *p) @@ -372,7 +375,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task, int whole) { - unsigned long vsize, eip, esp, wchan = ~0UL; + unsigned long vsize, eip, esp, wchan = 0; int priority, nice; int tty_pgrp = -1, tty_nr = 0; sigset_t sigign, sigcatch; @@ -504,7 +507,19 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, seq_put_decimal_ull(m, ' ', task->blocked.sig[0] & 0x7fffffffUL); seq_put_decimal_ull(m, ' ', sigign.sig[0] & 0x7fffffffUL); seq_put_decimal_ull(m, ' ', sigcatch.sig[0] & 0x7fffffffUL); - seq_put_decimal_ull(m, ' ', wchan); + + /* + * We used to output the absolute kernel address, but that's an + * information leak - so instead we show a 0/1 flag here, to signal + * to user-space whether there's a wchan field in /proc/PID/wchan. + * + * This works with older implementations of procps as well. + */ + if (wchan) + seq_puts(m, " 1"); + else + seq_puts(m, " 0"); + seq_put_decimal_ull(m, ' ', 0); seq_put_decimal_ull(m, ' ', 0); seq_put_decimal_ll(m, ' ', task->exit_signal); diff --git a/fs/proc/base.c b/fs/proc/base.c index f2193218a..ac5498d9b 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -430,13 +430,10 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns, wchan = get_wchan(task); - if (lookup_symbol_name(wchan, symname) < 0) { - if (!ptrace_may_access(task, PTRACE_MODE_READ)) - return 0; - seq_printf(m, "%lu", wchan); - } else { + if (wchan && ptrace_may_access(task, PTRACE_MODE_READ) && !lookup_symbol_name(wchan, symname)) seq_printf(m, "%s", symname); - } + else + seq_putc(m, '0'); return 0; } @@ -507,7 +504,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, seq_printf(m, "0 0 0\n"); else seq_printf(m, "%llu %llu %lu\n", - (unsigned long long)task->se.sum_exec_runtime, + (unsigned long long)tsk_seruntime(task), (unsigned long long)task->sched_info.run_delay, task->sched_info.pcount); @@ -1230,10 +1227,9 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, size_t count, loff_t *ppos) { struct inode * inode = file_inode(file); - char *page, *tmp; - ssize_t length; uid_t loginuid; kuid_t kloginuid; + int rv; rcu_read_lock(); if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) { @@ -1242,46 +1238,28 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, } rcu_read_unlock(); - if (count >= PAGE_SIZE) - count = PAGE_SIZE - 1; - if (*ppos != 0) { /* No partial writes. */ return -EINVAL; } - page = (char*)__get_free_page(GFP_TEMPORARY); - if (!page) - return -ENOMEM; - length = -EFAULT; - if (copy_from_user(page, buf, count)) - goto out_free_page; - page[count] = '\0'; - loginuid = simple_strtoul(page, &tmp, 10); - if (tmp == page) { - length = -EINVAL; - goto out_free_page; - - } + rv = kstrtou32_from_user(buf, count, 10, &loginuid); + if (rv < 0) + return rv; /* is userspace tring to explicitly UNSET the loginuid? */ if (loginuid == AUDIT_UID_UNSET) { kloginuid = INVALID_UID; } else { kloginuid = make_kuid(file->f_cred->user_ns, loginuid); - if (!uid_valid(kloginuid)) { - length = -EINVAL; - goto out_free_page; - } + if (!uid_valid(kloginuid)) + return -EINVAL; } - length = audit_set_loginuid(kloginuid); - if (likely(length == 0)) - length = count; - -out_free_page: - free_page((unsigned long) page); - return length; + rv = audit_set_loginuid(kloginuid); + if (rv < 0) + return rv; + return count; } static const struct file_operations proc_loginuid_operations = { @@ -1335,8 +1313,9 @@ static ssize_t proc_fault_inject_write(struct file * file, const char __user * buf, size_t count, loff_t *ppos) { struct task_struct *task; - char buffer[PROC_NUMBUF], *end; + char buffer[PROC_NUMBUF]; int make_it_fail; + int rv; if (!capable(CAP_SYS_RESOURCE)) return -EPERM; @@ -1345,9 +1324,9 @@ static ssize_t proc_fault_inject_write(struct file * file, count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) return -EFAULT; - make_it_fail = simple_strtol(strstrip(buffer), &end, 0); - if (*end) - return -EINVAL; + rv = kstrtoint(strstrip(buffer), 0, &make_it_fail); + if (rv < 0) + return rv; if (make_it_fail < 0 || make_it_fail > 1) return -EINVAL; @@ -1836,8 +1815,6 @@ end_instantiate: return dir_emit(ctx, name, len, 1, DT_UNKNOWN); } -#ifdef CONFIG_CHECKPOINT_RESTORE - /* * dname_to_vma_addr - maps a dentry name into two unsigned longs * which represent vma start and end addresses. @@ -1864,11 +1841,6 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags) if (flags & LOOKUP_RCU) return -ECHILD; - if (!capable(CAP_SYS_ADMIN)) { - status = -EPERM; - goto out_notask; - } - inode = d_inode(dentry); task = get_proc_task(inode); if (!task) @@ -1957,6 +1929,29 @@ struct map_files_info { unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */ }; +/* + * Only allow CAP_SYS_ADMIN to follow the links, due to concerns about how the + * symlinks may be used to bypass permissions on ancestor directories in the + * path to the file in question. + */ +static const char * +proc_map_files_follow_link(struct dentry *dentry, void **cookie) +{ + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + return proc_pid_follow_link(dentry, NULL); +} + +/* + * Identical to proc_pid_link_inode_operations except for follow_link() + */ +static const struct inode_operations proc_map_files_link_inode_operations = { + .readlink = proc_pid_readlink, + .follow_link = proc_map_files_follow_link, + .setattr = proc_setattr, +}; + static int proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, struct task_struct *task, const void *ptr) @@ -1972,7 +1967,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, ei = PROC_I(inode); ei->op.proc_get_link = proc_map_files_get_link; - inode->i_op = &proc_pid_link_inode_operations; + inode->i_op = &proc_map_files_link_inode_operations; inode->i_size = 64; inode->i_mode = S_IFLNK; @@ -1996,10 +1991,6 @@ static struct dentry *proc_map_files_lookup(struct inode *dir, int result; struct mm_struct *mm; - result = -EPERM; - if (!capable(CAP_SYS_ADMIN)) - goto out; - result = -ENOENT; task = get_proc_task(dir); if (!task) @@ -2053,10 +2044,6 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) struct map_files_info *p; int ret; - ret = -EPERM; - if (!capable(CAP_SYS_ADMIN)) - goto out; - ret = -ENOENT; task = get_proc_task(file_inode(file)); if (!task) @@ -2245,7 +2232,6 @@ static const struct file_operations proc_timers_operations = { .llseek = seq_lseek, .release = seq_release_private, }; -#endif /* CONFIG_CHECKPOINT_RESTORE */ static int proc_pident_instantiate(struct inode *dir, struct dentry *dentry, struct task_struct *task, const void *ptr) @@ -2481,32 +2467,20 @@ static ssize_t proc_coredump_filter_write(struct file *file, { struct task_struct *task; struct mm_struct *mm; - char buffer[PROC_NUMBUF], *end; unsigned int val; int ret; int i; unsigned long mask; - ret = -EFAULT; - memset(buffer, 0, sizeof(buffer)); - if (count > sizeof(buffer) - 1) - count = sizeof(buffer) - 1; - if (copy_from_user(buffer, buf, count)) - goto out_no_task; - - ret = -EINVAL; - val = (unsigned int)simple_strtoul(buffer, &end, 0); - if (*end == '\n') - end++; - if (end - buffer == 0) - goto out_no_task; + ret = kstrtouint_from_user(buf, count, 0, &val); + if (ret < 0) + return ret; ret = -ESRCH; task = get_proc_task(file_inode(file)); if (!task) goto out_no_task; - ret = end - buffer; mm = get_task_mm(task); if (!mm) goto out_no_mm; @@ -2522,7 +2496,9 @@ static ssize_t proc_coredump_filter_write(struct file *file, out_no_mm: put_task_struct(task); out_no_task: - return ret; + if (ret < 0) + return ret; + return count; } static const struct file_operations proc_coredump_filter_operations = { @@ -2744,9 +2720,7 @@ static const struct inode_operations proc_task_inode_operations; static const struct pid_entry tgid_base_stuff[] = { DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations), DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), -#ifdef CONFIG_CHECKPOINT_RESTORE DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations), -#endif DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), #ifdef CONFIG_NET diff --git a/fs/proc/generic.c b/fs/proc/generic.c index e5dee5c31..ff3ffc76a 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -26,7 +26,7 @@ #include "internal.h" -static DEFINE_SPINLOCK(proc_subdir_lock); +static DEFINE_RWLOCK(proc_subdir_lock); static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de) { @@ -172,9 +172,9 @@ static int xlate_proc_name(const char *name, struct proc_dir_entry **ret, { int rv; - spin_lock(&proc_subdir_lock); + read_lock(&proc_subdir_lock); rv = __xlate_proc_name(name, ret, residual); - spin_unlock(&proc_subdir_lock); + read_unlock(&proc_subdir_lock); return rv; } @@ -231,11 +231,11 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, { struct inode *inode; - spin_lock(&proc_subdir_lock); + read_lock(&proc_subdir_lock); de = pde_subdir_find(de, dentry->d_name.name, dentry->d_name.len); if (de) { pde_get(de); - spin_unlock(&proc_subdir_lock); + read_unlock(&proc_subdir_lock); inode = proc_get_inode(dir->i_sb, de); if (!inode) return ERR_PTR(-ENOMEM); @@ -243,7 +243,7 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, d_add(dentry, inode); return NULL; } - spin_unlock(&proc_subdir_lock); + read_unlock(&proc_subdir_lock); return ERR_PTR(-ENOENT); } @@ -270,12 +270,12 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file, if (!dir_emit_dots(file, ctx)) return 0; - spin_lock(&proc_subdir_lock); + read_lock(&proc_subdir_lock); de = pde_subdir_first(de); i = ctx->pos - 2; for (;;) { if (!de) { - spin_unlock(&proc_subdir_lock); + read_unlock(&proc_subdir_lock); return 0; } if (!i) @@ -287,19 +287,19 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file, do { struct proc_dir_entry *next; pde_get(de); - spin_unlock(&proc_subdir_lock); + read_unlock(&proc_subdir_lock); if (!dir_emit(ctx, de->name, de->namelen, de->low_ino, de->mode >> 12)) { pde_put(de); return 0; } - spin_lock(&proc_subdir_lock); + read_lock(&proc_subdir_lock); ctx->pos++; next = pde_subdir_next(de); pde_put(de); de = next; } while (de); - spin_unlock(&proc_subdir_lock); + read_unlock(&proc_subdir_lock); return 1; } @@ -338,16 +338,16 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp if (ret) return ret; - spin_lock(&proc_subdir_lock); + write_lock(&proc_subdir_lock); dp->parent = dir; if (pde_subdir_insert(dir, dp) == false) { WARN(1, "proc_dir_entry '%s/%s' already registered\n", dir->name, dp->name); - spin_unlock(&proc_subdir_lock); + write_unlock(&proc_subdir_lock); proc_free_inum(dp->low_ino); return -EEXIST; } - spin_unlock(&proc_subdir_lock); + write_unlock(&proc_subdir_lock); return 0; } @@ -549,9 +549,9 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) const char *fn = name; unsigned int len; - spin_lock(&proc_subdir_lock); + write_lock(&proc_subdir_lock); if (__xlate_proc_name(name, &parent, &fn) != 0) { - spin_unlock(&proc_subdir_lock); + write_unlock(&proc_subdir_lock); return; } len = strlen(fn); @@ -559,7 +559,7 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) de = pde_subdir_find(parent, fn, len); if (de) rb_erase(&de->subdir_node, &parent->subdir); - spin_unlock(&proc_subdir_lock); + write_unlock(&proc_subdir_lock); if (!de) { WARN(1, "name '%s'\n", name); return; @@ -583,16 +583,16 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) const char *fn = name; unsigned int len; - spin_lock(&proc_subdir_lock); + write_lock(&proc_subdir_lock); if (__xlate_proc_name(name, &parent, &fn) != 0) { - spin_unlock(&proc_subdir_lock); + write_unlock(&proc_subdir_lock); return -ENOENT; } len = strlen(fn); root = pde_subdir_find(parent, fn, len); if (!root) { - spin_unlock(&proc_subdir_lock); + write_unlock(&proc_subdir_lock); return -ENOENT; } rb_erase(&root->subdir_node, &parent->subdir); @@ -605,7 +605,7 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) de = next; continue; } - spin_unlock(&proc_subdir_lock); + write_unlock(&proc_subdir_lock); proc_entry_rundown(de); next = de->parent; @@ -616,7 +616,7 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) break; pde_put(de); - spin_lock(&proc_subdir_lock); + write_lock(&proc_subdir_lock); de = next; } pde_put(root); diff --git a/fs/proc/page.c b/fs/proc/page.c index 7eee2d8b9..93484034a 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -9,12 +9,16 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/hugetlb.h> +#include <linux/memcontrol.h> +#include <linux/mmu_notifier.h> +#include <linux/page_idle.h> #include <linux/kernel-page-flags.h> #include <asm/uaccess.h> #include "internal.h" #define KPMSIZE sizeof(u64) #define KPMMASK (KPMSIZE - 1) +#define KPMBITS (KPMSIZE * BITS_PER_BYTE) /* /proc/kpagecount - an array exposing page counts * @@ -54,6 +58,8 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf, pfn++; out++; count -= KPMSIZE; + + cond_resched(); } *ppos += (char __user *)out - buf; @@ -146,6 +152,9 @@ u64 stable_page_flags(struct page *page) if (PageBalloon(page)) u |= 1 << KPF_BALLOON; + if (page_is_idle(page)) + u |= 1 << KPF_IDLE; + u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); @@ -212,6 +221,8 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf, pfn++; out++; count -= KPMSIZE; + + cond_resched(); } *ppos += (char __user *)out - buf; @@ -225,10 +236,64 @@ static const struct file_operations proc_kpageflags_operations = { .read = kpageflags_read, }; +#ifdef CONFIG_MEMCG +static ssize_t kpagecgroup_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + u64 __user *out = (u64 __user *)buf; + struct page *ppage; + unsigned long src = *ppos; + unsigned long pfn; + ssize_t ret = 0; + u64 ino; + + pfn = src / KPMSIZE; + count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src); + if (src & KPMMASK || count & KPMMASK) + return -EINVAL; + + while (count > 0) { + if (pfn_valid(pfn)) + ppage = pfn_to_page(pfn); + else + ppage = NULL; + + if (ppage) + ino = page_cgroup_ino(ppage); + else + ino = 0; + + if (put_user(ino, out)) { + ret = -EFAULT; + break; + } + + pfn++; + out++; + count -= KPMSIZE; + + cond_resched(); + } + + *ppos += (char __user *)out - buf; + if (!ret) + ret = (char __user *)out - buf; + return ret; +} + +static const struct file_operations proc_kpagecgroup_operations = { + .llseek = mem_lseek, + .read = kpagecgroup_read, +}; +#endif /* CONFIG_MEMCG */ + static int __init proc_page_init(void) { proc_create("kpagecount", S_IRUSR, NULL, &proc_kpagecount_operations); proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations); +#ifdef CONFIG_MEMCG + proc_create("kpagecgroup", S_IRUSR, NULL, &proc_kpagecgroup_operations); +#endif return 0; } fs_initcall(proc_page_init); diff --git a/fs/proc/root.c b/fs/proc/root.c index 68feb0f70..361ab4ee4 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -134,6 +134,8 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, } sb->s_flags |= MS_ACTIVE; + /* User space would break if executables appear on proc */ + sb->s_iflags |= SB_I_NOEXEC; } return dget(sb->s_root); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 8940e47e3..5e7e631af 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -13,6 +13,7 @@ #include <linux/swap.h> #include <linux/swapops.h> #include <linux/mmu_notifier.h> +#include <linux/page_idle.h> #include <asm/elf.h> #include <asm/uaccess.h> @@ -449,6 +450,7 @@ struct mem_size_stats { unsigned long anonymous_thp; unsigned long swap; u64 pss; + u64 swap_pss; }; static void smaps_account(struct mem_size_stats *mss, struct page *page, @@ -461,7 +463,7 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, mss->resident += size; /* Accumulate the size in pages that have been accessed. */ - if (young || PageReferenced(page)) + if (young || page_is_young(page) || PageReferenced(page)) mss->referenced += size; mapcount = page_mapcount(page); if (mapcount >= 2) { @@ -495,9 +497,20 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, } else if (is_swap_pte(*pte)) { swp_entry_t swpent = pte_to_swp_entry(*pte); - if (!non_swap_entry(swpent)) + if (!non_swap_entry(swpent)) { + int mapcount; + mss->swap += PAGE_SIZE; - else if (is_migration_entry(swpent)) + mapcount = swp_swapcount(swpent); + if (mapcount >= 2) { + u64 pss_delta = (u64)PAGE_SIZE << PSS_SHIFT; + + do_div(pss_delta, mapcount); + mss->swap_pss += pss_delta; + } else { + mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT; + } + } else if (is_migration_entry(swpent)) page = migration_entry_to_page(swpent); } @@ -600,6 +613,8 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_HUGEPAGE)] = "hg", [ilog2(VM_NOHUGEPAGE)] = "nh", [ilog2(VM_MERGEABLE)] = "mg", + [ilog2(VM_UFFD_MISSING)]= "um", + [ilog2(VM_UFFD_WP)] = "uw", }; size_t i; @@ -641,6 +656,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) "Anonymous: %8lu kB\n" "AnonHugePages: %8lu kB\n" "Swap: %8lu kB\n" + "SwapPss: %8lu kB\n" "KernelPageSize: %8lu kB\n" "MMUPageSize: %8lu kB\n" "Locked: %8lu kB\n", @@ -655,6 +671,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) mss.anonymous >> 10, mss.anonymous_thp >> 10, mss.swap >> 10, + (unsigned long)(mss.swap_pss >> (10 + PSS_SHIFT)), vma_kernel_pagesize(vma) >> 10, vma_mmu_pagesize(vma) >> 10, (vma->vm_flags & VM_LOCKED) ? @@ -713,23 +730,6 @@ const struct file_operations proc_tid_smaps_operations = { .release = proc_map_release, }; -/* - * We do not want to have constant page-shift bits sitting in - * pagemap entries and are about to reuse them some time soon. - * - * Here's the "migration strategy": - * 1. when the system boots these bits remain what they are, - * but a warning about future change is printed in log; - * 2. once anyone clears soft-dirty bits via clear_refs file, - * these flag is set to denote, that user is aware of the - * new API and those page-shift bits change their meaning. - * The respective warning is printed in dmesg; - * 3. In a couple of releases we will remove all the mentions - * of page-shift in pagemap entries. - */ - -static bool soft_dirty_cleared __read_mostly; - enum clear_refs_types { CLEAR_REFS_ALL = 1, CLEAR_REFS_ANON, @@ -811,6 +811,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, /* Clear accessed and referenced bits. */ pmdp_test_and_clear_young(vma, addr, pmd); + test_and_clear_page_young(page); ClearPageReferenced(page); out: spin_unlock(ptl); @@ -838,6 +839,7 @@ out: /* Clear accessed and referenced bits. */ ptep_test_and_clear_young(vma, addr, pte); + test_and_clear_page_young(page); ClearPageReferenced(page); } pte_unmap_unlock(pte - 1, ptl); @@ -890,13 +892,6 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST) return -EINVAL; - if (type == CLEAR_REFS_SOFT_DIRTY) { - soft_dirty_cleared = true; - pr_warn_once("The pagemap bits 55-60 has changed their meaning!" - " See the linux/Documentation/vm/pagemap.txt for " - "details.\n"); - } - task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; @@ -964,36 +959,26 @@ typedef struct { struct pagemapread { int pos, len; /* units: PM_ENTRY_BYTES, not bytes */ pagemap_entry_t *buffer; - bool v2; + bool show_pfn; }; #define PAGEMAP_WALK_SIZE (PMD_SIZE) #define PAGEMAP_WALK_MASK (PMD_MASK) -#define PM_ENTRY_BYTES sizeof(pagemap_entry_t) -#define PM_STATUS_BITS 3 -#define PM_STATUS_OFFSET (64 - PM_STATUS_BITS) -#define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET) -#define PM_STATUS(nr) (((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK) -#define PM_PSHIFT_BITS 6 -#define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS) -#define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET) -#define __PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK) -#define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1) -#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK) -/* in "new" pagemap pshift bits are occupied with more status bits */ -#define PM_STATUS2(v2, x) (__PM_PSHIFT(v2 ? x : PAGE_SHIFT)) - -#define __PM_SOFT_DIRTY (1LL) -#define PM_PRESENT PM_STATUS(4LL) -#define PM_SWAP PM_STATUS(2LL) -#define PM_FILE PM_STATUS(1LL) -#define PM_NOT_PRESENT(v2) PM_STATUS2(v2, 0) +#define PM_ENTRY_BYTES sizeof(pagemap_entry_t) +#define PM_PFRAME_BITS 55 +#define PM_PFRAME_MASK GENMASK_ULL(PM_PFRAME_BITS - 1, 0) +#define PM_SOFT_DIRTY BIT_ULL(55) +#define PM_MMAP_EXCLUSIVE BIT_ULL(56) +#define PM_FILE BIT_ULL(61) +#define PM_SWAP BIT_ULL(62) +#define PM_PRESENT BIT_ULL(63) + #define PM_END_OF_BUFFER 1 -static inline pagemap_entry_t make_pme(u64 val) +static inline pagemap_entry_t make_pme(u64 frame, u64 flags) { - return (pagemap_entry_t) { .pme = val }; + return (pagemap_entry_t) { .pme = (frame & PM_PFRAME_MASK) | flags }; } static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme, @@ -1014,7 +999,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, while (addr < end) { struct vm_area_struct *vma = find_vma(walk->mm, addr); - pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); + pagemap_entry_t pme = make_pme(0, 0); /* End of address space hole, which we mark as non-present. */ unsigned long hole_end; @@ -1034,7 +1019,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, /* Addresses in the VMA. */ if (vma->vm_flags & VM_SOFTDIRTY) - pme.pme |= PM_STATUS2(pm->v2, __PM_SOFT_DIRTY); + pme = make_pme(0, PM_SOFT_DIRTY); for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) { err = add_to_pagemap(addr, &pme, pm); if (err) @@ -1045,67 +1030,42 @@ out: return err; } -static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, +static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, struct vm_area_struct *vma, unsigned long addr, pte_t pte) { - u64 frame, flags; + u64 frame = 0, flags = 0; struct page *page = NULL; - int flags2 = 0; if (pte_present(pte)) { - frame = pte_pfn(pte); - flags = PM_PRESENT; + if (pm->show_pfn) + frame = pte_pfn(pte); + flags |= PM_PRESENT; page = vm_normal_page(vma, addr, pte); if (pte_soft_dirty(pte)) - flags2 |= __PM_SOFT_DIRTY; + flags |= PM_SOFT_DIRTY; } else if (is_swap_pte(pte)) { swp_entry_t entry; if (pte_swp_soft_dirty(pte)) - flags2 |= __PM_SOFT_DIRTY; + flags |= PM_SOFT_DIRTY; entry = pte_to_swp_entry(pte); frame = swp_type(entry) | (swp_offset(entry) << MAX_SWAPFILES_SHIFT); - flags = PM_SWAP; + flags |= PM_SWAP; if (is_migration_entry(entry)) page = migration_entry_to_page(entry); - } else { - if (vma->vm_flags & VM_SOFTDIRTY) - flags2 |= __PM_SOFT_DIRTY; - *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2)); - return; } if (page && !PageAnon(page)) flags |= PM_FILE; - if ((vma->vm_flags & VM_SOFTDIRTY)) - flags2 |= __PM_SOFT_DIRTY; - - *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags); -} + if (page && page_mapcount(page) == 1) + flags |= PM_MMAP_EXCLUSIVE; + if (vma->vm_flags & VM_SOFTDIRTY) + flags |= PM_SOFT_DIRTY; -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, - pmd_t pmd, int offset, int pmd_flags2) -{ - /* - * Currently pmd for thp is always present because thp can not be - * swapped-out, migrated, or HWPOISONed (split in such cases instead.) - * This if-check is just to prepare for future implementation. - */ - if (pmd_present(pmd)) - *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset) - | PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT); - else - *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, pmd_flags2)); + return make_pme(frame, flags); } -#else -static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, - pmd_t pmd, int offset, int pmd_flags2) -{ -} -#endif -static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, +static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->vma; @@ -1114,41 +1074,58 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pte_t *pte, *orig_pte; int err = 0; - if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { - int pmd_flags2; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (pmd_trans_huge_lock(pmdp, vma, &ptl) == 1) { + u64 flags = 0, frame = 0; + pmd_t pmd = *pmdp; - if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd)) - pmd_flags2 = __PM_SOFT_DIRTY; - else - pmd_flags2 = 0; + if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(pmd)) + flags |= PM_SOFT_DIRTY; + + /* + * Currently pmd for thp is always present because thp + * can not be swapped-out, migrated, or HWPOISONed + * (split in such cases instead.) + * This if-check is just to prepare for future implementation. + */ + if (pmd_present(pmd)) { + struct page *page = pmd_page(pmd); + + if (page_mapcount(page) == 1) + flags |= PM_MMAP_EXCLUSIVE; + + flags |= PM_PRESENT; + if (pm->show_pfn) + frame = pmd_pfn(pmd) + + ((addr & ~PMD_MASK) >> PAGE_SHIFT); + } for (; addr != end; addr += PAGE_SIZE) { - unsigned long offset; - pagemap_entry_t pme; + pagemap_entry_t pme = make_pme(frame, flags); - offset = (addr & ~PAGEMAP_WALK_MASK) >> - PAGE_SHIFT; - thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset, pmd_flags2); err = add_to_pagemap(addr, &pme, pm); if (err) break; + if (pm->show_pfn && (flags & PM_PRESENT)) + frame++; } spin_unlock(ptl); return err; } - if (pmd_trans_unstable(pmd)) + if (pmd_trans_unstable(pmdp)) return 0; +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* * We can assume that @vma always points to a valid one and @end never * goes beyond vma->vm_end. */ - orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + orig_pte = pte = pte_offset_map_lock(walk->mm, pmdp, addr, &ptl); for (; addr < end; pte++, addr += PAGE_SIZE) { pagemap_entry_t pme; - pte_to_pagemap_entry(&pme, pm, vma, addr, *pte); + pme = pte_to_pagemap_entry(pm, vma, addr, *pte); err = add_to_pagemap(addr, &pme, pm); if (err) break; @@ -1161,40 +1138,44 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, } #ifdef CONFIG_HUGETLB_PAGE -static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, - pte_t pte, int offset, int flags2) -{ - if (pte_present(pte)) - *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) | - PM_STATUS2(pm->v2, flags2) | - PM_PRESENT); - else - *pme = make_pme(PM_NOT_PRESENT(pm->v2) | - PM_STATUS2(pm->v2, flags2)); -} - /* This function walks within one hugetlb entry in the single call */ -static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, +static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct pagemapread *pm = walk->private; struct vm_area_struct *vma = walk->vma; + u64 flags = 0, frame = 0; int err = 0; - int flags2; - pagemap_entry_t pme; + pte_t pte; if (vma->vm_flags & VM_SOFTDIRTY) - flags2 = __PM_SOFT_DIRTY; - else - flags2 = 0; + flags |= PM_SOFT_DIRTY; + + pte = huge_ptep_get(ptep); + if (pte_present(pte)) { + struct page *page = pte_page(pte); + + if (!PageAnon(page)) + flags |= PM_FILE; + + if (page_mapcount(page) == 1) + flags |= PM_MMAP_EXCLUSIVE; + + flags |= PM_PRESENT; + if (pm->show_pfn) + frame = pte_pfn(pte) + + ((addr & ~hmask) >> PAGE_SHIFT); + } for (; addr != end; addr += PAGE_SIZE) { - int offset = (addr & ~hmask) >> PAGE_SHIFT; - huge_pte_to_pagemap_entry(&pme, pm, *pte, offset, flags2); + pagemap_entry_t pme = make_pme(frame, flags); + err = add_to_pagemap(addr, &pme, pm); if (err) return err; + if (pm->show_pfn && (flags & PM_PRESENT)) + frame++; } cond_resched(); @@ -1212,7 +1193,9 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, * Bits 0-54 page frame number (PFN) if present * Bits 0-4 swap type if swapped * Bits 5-54 swap offset if swapped - * Bits 55-60 page shift (page size = 1<<page shift) + * Bit 55 pte is soft-dirty (see Documentation/vm/soft-dirty.txt) + * Bit 56 page exclusively mapped + * Bits 57-60 zero * Bit 61 page is file-page or shared-anon * Bit 62 page swapped * Bit 63 page present @@ -1230,42 +1213,37 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, static ssize_t pagemap_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - struct task_struct *task = get_proc_task(file_inode(file)); - struct mm_struct *mm; + struct mm_struct *mm = file->private_data; struct pagemapread pm; - int ret = -ESRCH; struct mm_walk pagemap_walk = {}; unsigned long src; unsigned long svpfn; unsigned long start_vaddr; unsigned long end_vaddr; - int copied = 0; + int ret = 0, copied = 0; - if (!task) + if (!mm || !atomic_inc_not_zero(&mm->mm_users)) goto out; ret = -EINVAL; /* file position must be aligned */ if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES)) - goto out_task; + goto out_mm; ret = 0; if (!count) - goto out_task; + goto out_mm; + + /* do not disclose physical addresses: attack vector */ + pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN); - pm.v2 = soft_dirty_cleared; pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY); ret = -ENOMEM; if (!pm.buffer) - goto out_task; - - mm = mm_access(task, PTRACE_MODE_READ); - ret = PTR_ERR(mm); - if (!mm || IS_ERR(mm)) - goto out_free; + goto out_mm; - pagemap_walk.pmd_entry = pagemap_pte_range; + pagemap_walk.pmd_entry = pagemap_pmd_range; pagemap_walk.pte_hole = pagemap_pte_hole; #ifdef CONFIG_HUGETLB_PAGE pagemap_walk.hugetlb_entry = pagemap_hugetlb_range; @@ -1276,10 +1254,10 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, src = *ppos; svpfn = src / PM_ENTRY_BYTES; start_vaddr = svpfn << PAGE_SHIFT; - end_vaddr = TASK_SIZE_OF(task); + end_vaddr = mm->task_size; /* watch out for wraparound */ - if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT) + if (svpfn > mm->task_size >> PAGE_SHIFT) start_vaddr = end_vaddr; /* @@ -1306,7 +1284,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, len = min(count, PM_ENTRY_BYTES * pm.pos); if (copy_to_user(buf, pm.buffer, len)) { ret = -EFAULT; - goto out_mm; + goto out_free; } copied += len; buf += len; @@ -1316,24 +1294,31 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, if (!ret || ret == PM_END_OF_BUFFER) ret = copied; -out_mm: - mmput(mm); out_free: kfree(pm.buffer); -out_task: - put_task_struct(task); +out_mm: + mmput(mm); out: return ret; } static int pagemap_open(struct inode *inode, struct file *file) { - /* do not disclose physical addresses: attack vector */ - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about " - "to stop being page-shift some time soon. See the " - "linux/Documentation/vm/pagemap.txt for details.\n"); + struct mm_struct *mm; + + mm = proc_mem_open(inode, PTRACE_MODE_READ); + if (IS_ERR(mm)) + return PTR_ERR(mm); + file->private_data = mm; + return 0; +} + +static int pagemap_release(struct inode *inode, struct file *file) +{ + struct mm_struct *mm = file->private_data; + + if (mm) + mmdrop(mm); return 0; } @@ -1341,6 +1326,7 @@ const struct file_operations proc_pagemap_operations = { .llseek = mem_lseek, /* borrow this */ .read = pagemap_read, .open = pagemap_open, + .release = pagemap_release, }; #endif /* CONFIG_PROC_PAGE_MONITOR */ |