/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/ /*** This file is part of systemd. Copyright (C) 2009-2013 Intel Corporation Authors: Auke Kok <auke-jan.h.kok@intel.com> systemd is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. systemd is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with systemd; If not, see <http://www.gnu.org/licenses/>. ***/ #include <unistd.h> #include <stdlib.h> #include <limits.h> #include <stdio.h> #include <string.h> #include <dirent.h> #include <fcntl.h> #include <time.h> #include "util.h" #include "time-util.h" #include "strxcpyx.h" #include "store.h" #include "bootchart.h" #include "cgroup-util.h" #include "fileio.h" /* * Alloc a static 4k buffer for stdio - primarily used to increase * PSS buffering from the default 1k stdin buffer to reduce * read() overhead. */ static char smaps_buf[4096]; static int skip = 0; double gettime_ns(void) { struct timespec n; clock_gettime(CLOCK_MONOTONIC, &n); return (n.tv_sec + (n.tv_nsec / (double) NSEC_PER_SEC)); } static char *bufgetline(char *buf) { char *c; if (!buf) return NULL; c = strchr(buf, '\n'); if (c) c++; return c; } static int pid_cmdline_strscpy(int procfd, char *buffer, size_t buf_len, int pid) { char filename[PATH_MAX]; _cleanup_close_ int fd = -1; ssize_t n; sprintf(filename, "%d/cmdline", pid); fd = openat(procfd, filename, O_RDONLY|O_CLOEXEC); if (fd < 0) return -errno; n = read(fd, buffer, buf_len-1); if (n > 0) { int i; for (i = 0; i < n; i++) if (buffer[i] == '\0') buffer[i] = ' '; buffer[n] = '\0'; } return 0; } int log_sample(DIR *proc, int sample, struct ps_struct *ps_first, struct list_sample_data **ptr, int *pscount, int *cpus) { static int vmstat = -1; _cleanup_free_ char *buf_schedstat = NULL; char buf[4096]; char key[256]; char val[256]; char rt[256]; char wt[256]; char *m; int r; int c; int p; int mod; static int e_fd = -1; ssize_t s; ssize_t n; struct dirent *ent; int fd; struct list_sample_data *sampledata; struct ps_sched_struct *ps_prev = NULL; int procfd; int taskfd = -1; sampledata = *ptr; procfd = dirfd(proc); if (procfd < 0) return -errno; if (vmstat < 0) { /* block stuff */ vmstat = openat(procfd, "vmstat", O_RDONLY|O_CLOEXEC); if (vmstat < 0) return log_error_errno(errno, "Failed to open /proc/vmstat: %m"); } n = pread(vmstat, buf, sizeof(buf) - 1, 0); if (n <= 0) { vmstat = safe_close(vmstat); if (n < 0) return -errno; return -ENODATA; } buf[n] = '\0'; m = buf; while (m) { if (sscanf(m, "%s %s", key, val) < 2) goto vmstat_next; if (streq(key, "pgpgin")) sampledata->blockstat.bi = atoi(val); if (streq(key, "pgpgout")) { sampledata->blockstat.bo = atoi(val); break; } vmstat_next: m = bufgetline(m); if (!m) break; } /* Parse "/proc/schedstat" for overall CPU utilization */ r = read_full_file("/proc/schedstat", &buf_schedstat, NULL); if (r < 0) return log_error_errno(r, "Unable to read schedstat: %m"); m = buf_schedstat; while (m) { if (sscanf(m, "%s %*s %*s %*s %*s %*s %*s %s %s", key, rt, wt) < 3) goto schedstat_next; if (strstr(key, "cpu")) { r = safe_atoi((const char*)(key+3), &c); if (r < 0 || c > MAXCPUS -1) /* Oops, we only have room for MAXCPUS data */ break; sampledata->runtime[c] = atoll(rt); sampledata->waittime[c] = atoll(wt); if (c == *cpus) *cpus = c + 1; } schedstat_next: m = bufgetline(m); if (!m) break; } if (arg_entropy) { if (e_fd < 0) { e_fd = openat(procfd, "sys/kernel/random/entropy_avail", O_RDONLY|O_CLOEXEC); if (e_fd < 0) return log_error_errno(errno, "Failed to open /proc/sys/kernel/random/entropy_avail: %m"); } n = pread(e_fd, buf, sizeof(buf) - 1, 0); if (n <= 0) { e_fd = safe_close(e_fd); } else { buf[n] = '\0'; sampledata->entropy_avail = atoi(buf); } } while ((ent = readdir(proc)) != NULL) { char filename[PATH_MAX]; int pid; struct ps_struct *ps; if ((ent->d_name[0] < '0') || (ent->d_name[0] > '9')) continue; pid = atoi(ent->d_name); if (pid >= MAXPIDS) continue; ps = ps_first; while (ps->next_ps) { ps = ps->next_ps; if (ps->pid == pid) break; } /* end of our LL? then append a new record */ if (ps->pid != pid) { _cleanup_fclose_ FILE *st = NULL; char t[32]; struct ps_struct *parent; ps->next_ps = new0(struct ps_struct, 1); if (!ps->next_ps) return log_oom(); ps = ps->next_ps; ps->pid = pid; ps->sched = -1; ps->schedstat = -1; ps->sample = new0(struct ps_sched_struct, 1); if (!ps->sample) return log_oom(); ps->sample->sampledata = sampledata; (*pscount)++; /* mark our first sample */ ps->first = ps->last = ps->sample; ps->sample->runtime = atoll(rt); ps->sample->waittime = atoll(wt); /* get name, start time */ if (ps->sched < 0) { sprintf(filename, "%d/sched", pid); ps->sched = openat(procfd, filename, O_RDONLY|O_CLOEXEC); if (ps->sched < 0) continue; } s = pread(ps->sched, buf, sizeof(buf) - 1, 0); if (s <= 0) { ps->sched = safe_close(ps->sched); continue; } buf[s] = '\0'; if (!sscanf(buf, "%s %*s %*s", key)) continue; strscpy(ps->name, sizeof(ps->name), key); /* cmdline */ if (arg_show_cmdline) pid_cmdline_strscpy(procfd, ps->name, sizeof(ps->name), pid); /* discard line 2 */ m = bufgetline(buf); if (!m) continue; m = bufgetline(m); if (!m) continue; if (!sscanf(m, "%*s %*s %s", t)) continue; r = safe_atod(t, &ps->starttime); if (r < 0) continue; ps->starttime /= 1000.0; if (arg_show_cgroup) /* if this fails, that's OK */ cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, ps->pid, &ps->cgroup); /* ppid */ sprintf(filename, "%d/stat", pid); fd = openat(procfd, filename, O_RDONLY|O_CLOEXEC); if (fd < 0) continue; st = fdopen(fd, "re"); if (!st) { close(fd); continue; } if (!fscanf(st, "%*s %*s %*s %i", &p)) continue; ps->ppid = p; /* * setup child pointers * * these are used to paint the tree coherently later * each parent has a LL of children, and a LL of siblings */ if (pid == 1) continue; /* nothing to do for init atm */ /* kthreadd has ppid=0, which breaks our tree ordering */ if (ps->ppid == 0) ps->ppid = 1; parent = ps_first; while ((parent->next_ps && parent->pid != ps->ppid)) parent = parent->next_ps; if (parent->pid != ps->ppid) { /* orphan */ ps->ppid = 1; parent = ps_first->next_ps; } ps->parent = parent; if (!parent->children) { /* it's the first child */ parent->children = ps; } else { /* walk all children and append */ struct ps_struct *children; children = parent->children; while (children->next) children = children->next; children->next = ps; } } /* else -> found pid, append data in ps */ /* below here is all continuous logging parts - we get here on every * iteration */ /* rt, wt */ if (ps->schedstat < 0) { sprintf(filename, "%d/schedstat", pid); ps->schedstat = openat(procfd, filename, O_RDONLY|O_CLOEXEC); if (ps->schedstat < 0) continue; } s = pread(ps->schedstat, buf, sizeof(buf) - 1, 0); if (s <= 0) { /* clean up our file descriptors - assume that the process exited */ close(ps->schedstat); ps->schedstat = -1; ps->sched = safe_close(ps->sched); continue; } buf[s] = '\0'; if (!sscanf(buf, "%s %s %*s", rt, wt)) continue; ps->sample->next = new0(struct ps_sched_struct, 1); if (!ps->sample->next) return log_oom(); ps->sample->next->prev = ps->sample; ps->sample = ps->sample->next; ps->last = ps->sample; ps->sample->runtime = atoll(rt); ps->sample->waittime = atoll(wt); ps->sample->sampledata = sampledata; ps->sample->ps_new = ps; if (ps_prev) ps_prev->cross = ps->sample; ps_prev = ps->sample; ps->total = (ps->last->runtime - ps->first->runtime) / 1000000000.0; /* Take into account CPU runtime/waittime spent in non-main threads of the process * by parsing "/proc/[pid]/task/[tid]/schedstat" for all [tid] != [pid] * See https://github.com/systemd/systemd/issues/139 */ /* Browse directory "/proc/[pid]/task" to know the thread ids of process [pid] */ snprintf(filename, sizeof(filename), PID_FMT "/task", pid); taskfd = openat(procfd, filename, O_RDONLY|O_DIRECTORY|O_CLOEXEC); if (taskfd >= 0) { _cleanup_closedir_ DIR *taskdir = NULL; taskdir = fdopendir(taskfd); if (!taskdir) { safe_close(taskfd); return -errno; } FOREACH_DIRENT(ent, taskdir, break) { int tid = -1; _cleanup_close_ int tid_schedstat = -1; long long delta_rt; long long delta_wt; if ((ent->d_name[0] < '0') || (ent->d_name[0] > '9')) continue; /* Skip main thread as it was already accounted */ r = safe_atoi(ent->d_name, &tid); if (r < 0 || tid == pid) continue; /* Parse "/proc/[pid]/task/[tid]/schedstat" */ snprintf(filename, sizeof(filename), PID_FMT "/schedstat", tid); tid_schedstat = openat(taskfd, filename, O_RDONLY|O_CLOEXEC); if (tid_schedstat == -1) continue; s = pread(tid_schedstat, buf, sizeof(buf) - 1, 0); if (s <= 0) continue; buf[s] = '\0'; if (!sscanf(buf, "%s %s %*s", rt, wt)) continue; r = safe_atolli(rt, &delta_rt); if (r < 0) continue; r = safe_atolli(rt, &delta_wt); if (r < 0) continue; ps->sample->runtime += delta_rt; ps->sample->waittime += delta_wt; } } if (!arg_pss) goto catch_rename; /* Pss */ if (!ps->smaps) { sprintf(filename, "%d/smaps", pid); fd = openat(procfd, filename, O_RDONLY|O_CLOEXEC); if (fd < 0) continue; ps->smaps = fdopen(fd, "re"); if (!ps->smaps) { close(fd); continue; } setvbuf(ps->smaps, smaps_buf, _IOFBF, sizeof(smaps_buf)); } else { rewind(ps->smaps); } /* test to see if we need to skip another field */ if (skip == 0) { if (fgets(buf, sizeof(buf), ps->smaps) == NULL) { continue; } if (fread(buf, 1, 28 * 15, ps->smaps) != (28 * 15)) { continue; } if (buf[392] == 'V') { skip = 2; } else { skip = 1; } rewind(ps->smaps); } while (1) { int pss_kb; /* skip one line, this contains the object mapped. */ if (fgets(buf, sizeof(buf), ps->smaps) == NULL) { break; } /* then there's a 28 char 14 line block */ if (fread(buf, 1, 28 * 14, ps->smaps) != 28 * 14) { break; } pss_kb = atoi(&buf[61]); ps->sample->pss += pss_kb; /* skip one more line if this is a newer kernel */ if (skip == 2) { if (fgets(buf, sizeof(buf), ps->smaps) == NULL) break; } } if (ps->sample->pss > ps->pss_max) ps->pss_max = ps->sample->pss; catch_rename: /* catch process rename, try to randomize time */ mod = (arg_hz < 4.0) ? 4.0 : (arg_hz / 4.0); if (((sample - ps->pid) + pid) % (int)(mod) == 0) { /* re-fetch name */ /* get name, start time */ if (ps->sched < 0) { sprintf(filename, "%d/sched", pid); ps->sched = openat(procfd, filename, O_RDONLY|O_CLOEXEC); if (ps->sched < 0) continue; } s = pread(ps->sched, buf, sizeof(buf) - 1, 0); if (s <= 0) { /* clean up file descriptors */ ps->sched = safe_close(ps->sched); ps->schedstat = safe_close(ps->schedstat); continue; } buf[s] = '\0'; if (!sscanf(buf, "%s %*s %*s", key)) continue; strscpy(ps->name, sizeof(ps->name), key); /* cmdline */ if (arg_show_cmdline) pid_cmdline_strscpy(procfd, ps->name, sizeof(ps->name), pid); } } return 0; }