/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/

/***
  This file is part of systemd.

  Copyright 2010 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/

#include <errno.h>
#include <unistd.h>
#include <signal.h>
#include <string.h>
#include <stdlib.h>
#include <dirent.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <ftw.h>

#include "cgroup-util.h"
#include "log.h"
#include "set.h"
#include "macro.h"
#include "util.h"

int cg_enumerate_processes(const char *controller, const char *path, FILE **_f) {
        char *fs;
        int r;
        FILE *f;

        assert(controller);
        assert(path);
        assert(_f);

        if ((r = cg_get_path(controller, path, "cgroup.procs", &fs)) < 0)
                return r;

        f = fopen(fs, "re");
        free(fs);

        if (!f)
                return -errno;

        *_f = f;
        return 0;
}

int cg_enumerate_tasks(const char *controller, const char *path, FILE **_f) {
        char *fs;
        int r;
        FILE *f;

        assert(controller);
        assert(path);
        assert(_f);

        if ((r = cg_get_path(controller, path, "tasks", &fs)) < 0)
                return r;

        f = fopen(fs, "re");
        free(fs);

        if (!f)
                return -errno;

        *_f = f;
        return 0;
}

int cg_read_pid(FILE *f, pid_t *_pid) {
        unsigned long ul;

        /* Note that the cgroup.procs might contain duplicates! See
         * cgroups.txt for details. */

        errno = 0;
        if (fscanf(f, "%lu", &ul) != 1) {

                if (feof(f))
                        return 0;

                return errno ? -errno : -EIO;
        }

        if (ul <= 0)
                return -EIO;

        *_pid = (pid_t) ul;
        return 1;
}

int cg_enumerate_subgroups(const char *controller, const char *path, DIR **_d) {
        char *fs;
        int r;
        DIR *d;

        assert(controller);
        assert(path);
        assert(_d);

        /* This is not recursive! */

        if ((r = cg_get_path(controller, path, NULL, &fs)) < 0)
                return r;

        d = opendir(fs);
        free(fs);

        if (!d)
                return -errno;

        *_d = d;
        return 0;
}

int cg_read_subgroup(DIR *d, char **fn) {
        struct dirent *de;

        assert(d);

        errno = 0;
        while ((de = readdir(d))) {
                char *b;

                if (de->d_type != DT_DIR)
                        continue;

                if (streq(de->d_name, ".") ||
                    streq(de->d_name, ".."))
                        continue;

                if (!(b = strdup(de->d_name)))
                        return -ENOMEM;

                *fn = b;
                return 1;
        }

        if (errno)
                return -errno;

        return 0;
}

int cg_rmdir(const char *controller, const char *path, bool honour_sticky) {
        char *p;
        int r;

        r = cg_get_path(controller, path, NULL, &p);
        if (r < 0)
                return r;

        if (honour_sticky) {
                char *tasks;

                /* If the sticky bit is set don't remove the directory */

                tasks = strappend(p, "/tasks");
                if (!tasks) {
                        free(p);
                        return -ENOMEM;
                }

                r = file_is_priv_sticky(tasks);
                free(tasks);

                if (r > 0) {
                        free(p);
                        return 0;
                }
        }

        r = rmdir(p);
        free(p);

        return (r < 0 && errno != ENOENT) ? -errno : 0;
}

int cg_kill(const char *controller, const char *path, int sig, bool sigcont, bool ignore_self, Set *s) {
        bool done = false;
        int r, ret = 0;
        pid_t my_pid;
        FILE *f = NULL;
        Set *allocated_set = NULL;

        assert(controller);
        assert(path);
        assert(sig >= 0);

        /* This goes through the tasks list and kills them all. This
         * is repeated until no further processes are added to the
         * tasks list, to properly handle forking processes */

        if (!s)
                if (!(s = allocated_set = set_new(trivial_hash_func, trivial_compare_func)))
                        return -ENOMEM;

        my_pid = getpid();

        do {
                pid_t pid = 0;
                done = true;

                if ((r = cg_enumerate_processes(controller, path, &f)) < 0) {
                        if (ret >= 0 && r != -ENOENT)
                                ret = r;

                        goto finish;
                }

                while ((r = cg_read_pid(f, &pid)) > 0) {

                        if (pid == my_pid && ignore_self)
                                continue;

                        if (set_get(s, LONG_TO_PTR(pid)) == LONG_TO_PTR(pid))
                                continue;

                        /* If we haven't killed this process yet, kill
                         * it */
                        if (kill(pid, sig) < 0) {
                                if (ret >= 0 && errno != ESRCH)
                                        ret = -errno;
                        } else if (ret == 0) {

                                if (sigcont)
                                        kill(pid, SIGCONT);

                                ret = 1;
                        }

                        done = false;

                        if ((r = set_put(s, LONG_TO_PTR(pid))) < 0) {
                                if (ret >= 0)
                                        ret = r;

                                goto finish;
                        }
                }

                if (r < 0) {
                        if (ret >= 0)
                                ret = r;

                        goto finish;
                }

                fclose(f);
                f = NULL;

                /* To avoid racing against processes which fork
                 * quicker than we can kill them we repeat this until
                 * no new pids need to be killed. */

        } while (!done);

finish:
        if (allocated_set)
                set_free(allocated_set);

        if (f)
                fclose(f);

        return ret;
}

int cg_kill_recursive(const char *controller, const char *path, int sig, bool sigcont, bool ignore_self, bool rem, Set *s) {
        int r, ret = 0;
        DIR *d = NULL;
        char *fn;
        Set *allocated_set = NULL;

        assert(path);
        assert(controller);
        assert(sig >= 0);

        if (!s)
                if (!(s = allocated_set = set_new(trivial_hash_func, trivial_compare_func)))
                        return -ENOMEM;

        ret = cg_kill(controller, path, sig, sigcont, ignore_self, s);

        if ((r = cg_enumerate_subgroups(controller, path, &d)) < 0) {
                if (ret >= 0 && r != -ENOENT)
                        ret = r;

                goto finish;
        }

        while ((r = cg_read_subgroup(d, &fn)) > 0) {
                char *p = NULL;

                r = asprintf(&p, "%s/%s", path, fn);
                free(fn);

                if (r < 0) {
                        if (ret >= 0)
                                ret = -ENOMEM;

                        goto finish;
                }

                r = cg_kill_recursive(controller, p, sig, sigcont, ignore_self, rem, s);
                free(p);

                if (r != 0 && ret >= 0)
                        ret = r;
        }

        if (r < 0 && ret >= 0)
                ret = r;

        if (rem)
                if ((r = cg_rmdir(controller, path, true)) < 0) {
                        if (ret >= 0 &&
                            r != -ENOENT &&
                            r != -EBUSY)
                                ret = r;
                }

finish:
        if (d)
                closedir(d);

        if (allocated_set)
                set_free(allocated_set);

        return ret;
}

int cg_kill_recursive_and_wait(const char *controller, const char *path, bool rem) {
        unsigned i;

        assert(path);
        assert(controller);

        /* This safely kills all processes; first it sends a SIGTERM,
         * then checks 8 times after 200ms whether the group is now
         * empty, then kills everything that is left with SIGKILL and
         * finally checks 5 times after 200ms each whether the group
         * is finally empty. */

        for (i = 0; i < 15; i++) {
                int sig, r;

                if (i <= 0)
                        sig = SIGTERM;
                else if (i == 9)
                        sig = SIGKILL;
                else
                        sig = 0;

                if ((r = cg_kill_recursive(controller, path, sig, true, true, rem, NULL)) <= 0)
                        return r;

                usleep(200 * USEC_PER_MSEC);
        }

        return 0;
}

int cg_migrate(const char *controller, const char *from, const char *to, bool ignore_self) {
        bool done = false;
        Set *s;
        int r, ret = 0;
        pid_t my_pid;
        FILE *f = NULL;

        assert(controller);
        assert(from);
        assert(to);

        if (!(s = set_new(trivial_hash_func, trivial_compare_func)))
                return -ENOMEM;

        my_pid = getpid();

        do {
                pid_t pid = 0;
                done = true;

                if ((r = cg_enumerate_tasks(controller, from, &f)) < 0) {
                        if (ret >= 0 && r != -ENOENT)
                                ret = r;

                        goto finish;
                }

                while ((r = cg_read_pid(f, &pid)) > 0) {

                        /* This might do weird stuff if we aren't a
                         * single-threaded program. However, we
                         * luckily know we are not */
                        if (pid == my_pid && ignore_self)
                                continue;

                        if (set_get(s, LONG_TO_PTR(pid)) == LONG_TO_PTR(pid))
                                continue;

                        if ((r = cg_attach(controller, to, pid)) < 0) {
                                if (ret >= 0 && r != -ESRCH)
                                        ret = r;
                        } else if (ret == 0)
                                ret = 1;

                        done = false;

                        if ((r = set_put(s, LONG_TO_PTR(pid))) < 0) {
                                if (ret >= 0)
                                        ret = r;

                                goto finish;
                        }
                }

                if (r < 0) {
                        if (ret >= 0)
                                ret = r;

                        goto finish;
                }

                fclose(f);
                f = NULL;

        } while (!done);

finish:
        set_free(s);

        if (f)
                fclose(f);

        return ret;
}

int cg_migrate_recursive(const char *controller, const char *from, const char *to, bool ignore_self, bool rem) {
        int r, ret = 0;
        DIR *d = NULL;
        char *fn;

        assert(controller);
        assert(from);
        assert(to);

        ret = cg_migrate(controller, from, to, ignore_self);

        if ((r = cg_enumerate_subgroups(controller, from, &d)) < 0) {
                if (ret >= 0 && r != -ENOENT)
                        ret = r;
                goto finish;
        }

        while ((r = cg_read_subgroup(d, &fn)) > 0) {
                char *p = NULL;

                r = asprintf(&p, "%s/%s", from, fn);
                free(fn);

                if (r < 0) {
                        if (ret >= 0)
                                ret = -ENOMEM;

                        goto finish;
                }

                r = cg_migrate_recursive(controller, p, to, ignore_self, rem);
                free(p);

                if (r != 0 && ret >= 0)
                        ret = r;
        }

        if (r < 0 && ret >= 0)
                ret = r;

        if (rem)
                if ((r = cg_rmdir(controller, from, true)) < 0) {
                        if (ret >= 0 &&
                            r != -ENOENT &&
                            r != -EBUSY)
                                ret = r;
                }

finish:
        if (d)
                closedir(d);

        return ret;
}

int cg_get_path(const char *controller, const char *path, const char *suffix, char **fs) {
        const char *p;
        char *t;
        static __thread bool good = false;

        assert(controller);
        assert(fs);

        if (_unlikely_(!good)) {
                int r;

                r = path_is_mount_point("/sys/fs/cgroup", false);
                if (r <= 0)
                        return r < 0 ? r : -ENOENT;

                /* Cache this to save a few stat()s */
                good = true;
        }

        if (isempty(controller))
                return -EINVAL;

        /* This is a very minimal lookup from controller names to
         * paths. Since we have mounted most hierarchies ourselves
         * should be kinda safe, but eventually we might want to
         * extend this to have a fallback to actually check
         * /proc/mounts. Might need caching then. */

        if (streq(controller, SYSTEMD_CGROUP_CONTROLLER))
                p = "systemd";
        else if (startswith(controller, "name="))
                p = controller + 5;
        else
                p = controller;

        if (path && suffix)
                t = join("/sys/fs/cgroup/", p, "/", path, "/", suffix, NULL);
        else if (path)
                t = join("/sys/fs/cgroup/", p, "/", path, NULL);
        else if (suffix)
                t = join("/sys/fs/cgroup/", p, "/", suffix, NULL);
        else
                t = join("/sys/fs/cgroup/", p, NULL);

        if (!t)
                return -ENOMEM;

        path_kill_slashes(t);

        *fs = t;
        return 0;
}

static int trim_cb(const char *path, const struct stat *sb, int typeflag, struct FTW *ftwbuf) {
        char *p;
        bool is_sticky;

        if (typeflag != FTW_DP)
                return 0;

        if (ftwbuf->level < 1)
                return 0;

        p = strappend(path, "/tasks");
        if (!p) {
                errno = ENOMEM;
                return 1;
        }

        is_sticky = file_is_priv_sticky(p) > 0;
        free(p);

        if (is_sticky)
                return 0;

        rmdir(path);
        return 0;
}

int cg_trim(const char *controller, const char *path, bool delete_root) {
        char *fs;
        int r = 0;

        assert(controller);
        assert(path);

        r = cg_get_path(controller, path, NULL, &fs);
        if (r < 0)
                return r;

        errno = 0;
        if (nftw(fs, trim_cb, 64, FTW_DEPTH|FTW_MOUNT|FTW_PHYS) < 0)
                r = errno ? -errno : -EIO;

        if (delete_root) {
                bool is_sticky;
                char *p;

                p = strappend(fs, "/tasks");
                if (!p) {
                        free(fs);
                        return -ENOMEM;
                }

                is_sticky = file_is_priv_sticky(p) > 0;
                free(p);

                if (!is_sticky)
                        if (rmdir(fs) < 0 && errno != ENOENT) {
                                if (r == 0)
                                        r = -errno;
                        }
        }

        free(fs);

        return r;
}

int cg_delete(const char *controller, const char *path) {
        char *parent;
        int r;

        assert(controller);
        assert(path);

        if ((r = parent_of_path(path, &parent)) < 0)
                return r;

        r = cg_migrate_recursive(controller, path, parent, false, true);
        free(parent);

        return r == -ENOENT ? 0 : r;
}

int cg_attach(const char *controller, const char *path, pid_t pid) {
        char *fs;
        int r;
        char c[32];

        assert(controller);
        assert(path);
        assert(pid >= 0);

        if ((r = cg_get_path(controller, path, "tasks", &fs)) < 0)
                return r;

        if (pid == 0)
                pid = getpid();

        snprintf(c, sizeof(c), "%lu\n", (unsigned long) pid);
        char_array_0(c);

        r = write_one_line_file(fs, c);
        free(fs);

        return r;
}

int cg_set_group_access(const char *controller, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        char *fs;
        int r;

        assert(controller);
        assert(path);

        if (mode != (mode_t) -1)
                mode &= 0777;

        r = cg_get_path(controller, path, NULL, &fs);
        if (r < 0)
                return r;

        r = chmod_and_chown(fs, mode, uid, gid);
        free(fs);

        return r;
}

int cg_set_task_access(const char *controller, const char *path, mode_t mode, uid_t uid, gid_t gid, int sticky) {
        char *fs;
        int r;

        assert(controller);
        assert(path);

        if (mode == (mode_t) -1 && uid == (uid_t) -1 && gid == (gid_t) -1 && sticky < 0)
                return 0;

        if (mode != (mode_t) -1)
                mode &= 0666;

        r = cg_get_path(controller, path, "tasks", &fs);
        if (r < 0)
                return r;

        if (sticky >= 0 && mode != (mode_t) -1)
                /* Both mode and sticky param are passed */
                mode |= (sticky ? S_ISVTX : 0);
        else if ((sticky >= 0 && mode == (mode_t) -1) ||
                 (mode != (mode_t) -1 && sticky < 0)) {
                struct stat st;

                /* Only one param is passed, hence read the current
                 * mode from the file itself */

                r = lstat(fs, &st);
                if (r < 0) {
                        free(fs);
                        return -errno;
                }

                if (mode == (mode_t) -1)
                        /* No mode set, we just shall set the sticky bit */
                        mode = (st.st_mode & ~S_ISVTX) | (sticky ? S_ISVTX : 0);
                else
                        /* Only mode set, leave sticky bit untouched */
                        mode = (st.st_mode & ~0777) | mode;
        }

        r = chmod_and_chown(fs, mode, uid, gid);
        free(fs);

        return r;
}

int cg_get_by_pid(const char *controller, pid_t pid, char **path) {
        int r;
        char *p = NULL;
        FILE *f;
        char *fs;
        size_t cs;

        assert(controller);
        assert(path);
        assert(pid >= 0);

        if (pid == 0)
                pid = getpid();

        if (asprintf(&fs, "/proc/%lu/cgroup", (unsigned long) pid) < 0)
                return -ENOMEM;

        f = fopen(fs, "re");
        free(fs);

        if (!f)
                return errno == ENOENT ? -ESRCH : -errno;

        cs = strlen(controller);

        while (!feof(f)) {
                char line[LINE_MAX];
                char *l;

                errno = 0;
                if (!(fgets(line, sizeof(line), f))) {
                        if (feof(f))
                                break;

                        r = errno ? -errno : -EIO;
                        goto finish;
                }

                truncate_nl(line);

                if (!(l = strchr(line, ':')))
                        continue;

                l++;
                if (strncmp(l, controller, cs) != 0)
                        continue;

                if (l[cs] != ':')
                        continue;

                if (!(p = strdup(l + cs + 1))) {
                        r = -ENOMEM;
                        goto finish;
                }

                *path = p;
                r = 0;
                goto finish;
        }

        r = -ENOENT;

finish:
        fclose(f);

        return r;
}

int cg_install_release_agent(const char *controller, const char *agent) {
        char *fs = NULL, *contents = NULL, *line = NULL, *sc;
        int r;

        assert(controller);
        assert(agent);

        if ((r = cg_get_path(controller, NULL, "release_agent", &fs)) < 0)
                return r;

        if ((r = read_one_line_file(fs, &contents)) < 0)
                goto finish;

        sc = strstrip(contents);
        if (sc[0] == 0) {

                if (asprintf(&line, "%s\n", agent) < 0) {
                        r = -ENOMEM;
                        goto finish;
                }

                if ((r = write_one_line_file(fs, line)) < 0)
                        goto finish;

        } else if (!streq(sc, agent)) {
                r = -EEXIST;
                goto finish;
        }

        free(fs);
        fs = NULL;
        if ((r = cg_get_path(controller, NULL, "notify_on_release", &fs)) < 0)
                goto finish;

        free(contents);
        contents = NULL;
        if ((r = read_one_line_file(fs, &contents)) < 0)
                goto finish;

        sc = strstrip(contents);

        if (streq(sc, "0")) {
                if ((r = write_one_line_file(fs, "1\n")) < 0)
                        goto finish;

                r = 1;
        } else if (!streq(sc, "1")) {
                r = -EIO;
                goto finish;
        } else
                r = 0;

finish:
        free(fs);
        free(contents);
        free(line);

        return r;
}

int cg_is_empty(const char *controller, const char *path, bool ignore_self) {
        pid_t pid = 0;
        int r;
        FILE *f = NULL;
        bool found = false;

        assert(controller);
        assert(path);

        if ((r = cg_enumerate_tasks(controller, path, &f)) < 0)
                return r == -ENOENT ? 1 : r;

        while ((r = cg_read_pid(f, &pid)) > 0) {

                if (ignore_self && pid == getpid())
                        continue;

                found = true;
                break;
        }

        fclose(f);

        if (r < 0)
                return r;

        return !found;
}

int cg_is_empty_recursive(const char *controller, const char *path, bool ignore_self) {
        int r;
        DIR *d = NULL;
        char *fn;

        assert(controller);
        assert(path);

        if ((r = cg_is_empty(controller, path, ignore_self)) <= 0)
                return r;

        if ((r = cg_enumerate_subgroups(controller, path, &d)) < 0)
                return r == -ENOENT ? 1 : r;

        while ((r = cg_read_subgroup(d, &fn)) > 0) {
                char *p = NULL;

                r = asprintf(&p, "%s/%s", path, fn);
                free(fn);

                if (r < 0) {
                        r = -ENOMEM;
                        goto finish;
                }

                r = cg_is_empty_recursive(controller, p, ignore_self);
                free(p);

                if (r <= 0)
                        goto finish;
        }

        if (r >= 0)
                r = 1;

finish:

        if (d)
                closedir(d);

        return r;
}

int cg_split_spec(const char *spec, char **controller, char **path) {
        const char *e;
        char *t = NULL, *u = NULL;

        assert(spec);
        assert(controller || path);

        if (*spec == '/') {

                if (path) {
                        if (!(t = strdup(spec)))
                                return -ENOMEM;

                        *path = t;
                }

                if (controller)
                        *controller = NULL;

                return 0;
        }

        if (!(e = strchr(spec, ':'))) {

                if (strchr(spec, '/') || spec[0] == 0)
                        return -EINVAL;

                if (controller) {
                        if (!(t = strdup(spec)))
                                return -ENOMEM;

                        *controller = t;
                }

                if (path)
                        *path = NULL;

                return 0;
        }

        if (e[1] != '/' ||
            e == spec ||
            memchr(spec, '/', e-spec))
                return -EINVAL;

        if (controller)
                if (!(t = strndup(spec, e-spec)))
                        return -ENOMEM;

        if (path)
                if (!(u = strdup(e+1))) {
                        free(t);
                        return -ENOMEM;
                }

        if (controller)
                *controller = t;

        if (path)
                *path = u;

        return 0;
}

int cg_join_spec(const char *controller, const char *path, char **spec) {
        assert(controller);
        assert(path);

        if (!path_is_absolute(path) ||
            controller[0] == 0 ||
            strchr(controller, ':') ||
            strchr(controller, '/'))
                return -EINVAL;

        if (asprintf(spec, "%s:%s", controller, path) < 0)
                return -ENOMEM;

        return 0;
}

int cg_fix_path(const char *path, char **result) {
        char *t, *c, *p;
        int r;

        assert(path);
        assert(result);

        /* First check if it already is a filesystem path */
        if (path_is_absolute(path) &&
            path_startswith(path, "/sys/fs/cgroup") &&
            access(path, F_OK) >= 0) {

                if (!(t = strdup(path)))
                        return -ENOMEM;

                *result = t;
                return 0;
        }

        /* Otherwise treat it as cg spec */
        if ((r = cg_split_spec(path, &c, &p)) < 0)
                return r;

        r = cg_get_path(c ? c : SYSTEMD_CGROUP_CONTROLLER, p ? p : "/", NULL, result);
        free(c);
        free(p);

        return r;
}

int cg_get_user_path(char **path) {
        char *root, *p;

        assert(path);

        /* Figure out the place to put user cgroups below. We use the
         * same as PID 1 has but with the "/system" suffix replaced by
         * "/user" */

        if (cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 1, &root) < 0)
                p = strdup("/user");
        else {
                if (endswith(root, "/system"))
                        root[strlen(root) - 7] = 0;
                else if (streq(root, "/"))
                        root[0] = 0;

                p = strappend(root, "/user");
                free(root);
        }

        if (!p)
                return -ENOMEM;

        *path = p;
        return 0;
}