From 548bd57376f7eb82cc792f0476688ccc9843962a Mon Sep 17 00:00:00 2001 From: Evgeny Vereshchagin Date: Thu, 20 Oct 2016 09:01:45 +0000 Subject: basic: fallback to the fstat if we don't have access to the /proc/self/fdinfo https://github.com/systemd/systemd/pull/4372#discussion_r83354107: I get `open("/proc/self/fdinfo/13", O_RDONLY|O_CLOEXEC) = -1 EACCES (Permission denied)` 327 mkdir("/proc", 0755 327 <... mkdir resumed> ) = -1 EEXIST (File exists) 327 stat("/proc", 327 <... stat resumed> {st_dev=makedev(8, 1), st_ino=28585, st_mode=S_IFDIR|0755, st_nlink=2, st_uid=0, st_gid=0, st_blksize=1024, st_blocks=4, st_size=1024, st_atime=2016/10/14-02:55:32, st_mtime=2016/ 327 mount("proc", "/proc", "proc", MS_NOSUID|MS_NODEV|MS_NOEXEC, NULL 327 <... mount resumed> ) = 0 327 lstat("/proc", 327 <... lstat resumed> {st_dev=makedev(0, 34), st_ino=1, st_mode=S_IFDIR|0555, st_nlink=75, st_uid=65534, st_gid=65534, st_blksize=1024, st_blocks=0, st_size=0, st_atime=2016/10/14-03:13:35.971031263, 327 lstat("/proc/sys", {st_dev=makedev(0, 34), st_ino=4026531855, st_mode=S_IFDIR|0555, st_nlink=1, st_uid=65534, st_gid=65534, st_blksize=1024, st_blocks=0, st_size=0, st_atime=2016/10/14-03:13:39.1630 327 openat(AT_FDCWD, "/proc", O_RDONLY|O_DIRECTORY|O_CLOEXEC|O_PATH) = 11 327 name_to_handle_at(11, "sys", {handle_bytes=128}, 0x7ffe3a238604, AT_SYMLINK_FOLLOW) = -1 EOPNOTSUPP (Operation not supported) 327 name_to_handle_at(11, "", {handle_bytes=128}, 0x7ffe3a238608, AT_EMPTY_PATH) = -1 EOPNOTSUPP (Operation not supported) 327 openat(11, "sys", O_RDONLY|O_CLOEXEC|O_PATH) = 13 327 open("/proc/self/fdinfo/13", O_RDONLY|O_CLOEXEC) = -1 EACCES (Permission denied) 327 close(13 327 <... close resumed> ) = 0 327 close(11 327 <... 
close resumed> ) = 0 -bash-4.3# ls -ld /proc/ dr-xr-xr-x 76 65534 65534 0 Oct 14 02:57 /proc/ -bash-4.3# ls -ld /proc/1 dr-xr-xr-x 9 root root 0 Oct 14 02:57 /proc/1 -bash-4.3# ls -ld /proc/1/fdinfo dr-x------ 2 65534 65534 0 Oct 14 03:00 /proc/1/fdinfo --- src/basic/mount-util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src') diff --git a/src/basic/mount-util.c b/src/basic/mount-util.c index 0ef00676ef..2985cc475a 100644 --- a/src/basic/mount-util.c +++ b/src/basic/mount-util.c @@ -162,7 +162,7 @@ int fd_is_mount_point(int fd, const char *filename, int flags) { fallback_fdinfo: r = fd_fdinfo_mnt_id(fd, filename, flags, &mount_id); - if (r == -EOPNOTSUPP) + if (IN_SET(r, -EOPNOTSUPP, -EACCES)) goto fallback_fstat; if (r < 0) return r; -- cgit v1.2.3-54-g00ecf From 63eae72312b6b8df4c7186233994a65d747229a7 Mon Sep 17 00:00:00 2001 From: Evgeny Vereshchagin Date: Thu, 20 Oct 2016 09:03:40 +0000 Subject: nspawn: really lchown(uid/gid) https://github.com/systemd/systemd/pull/4372#issuecomment-253723849: * `mount_all (outer_child)` creates `container_dir/sys/fs/selinux` * `mount_all (outer_child)` doesn't patch `container_dir/sys/fs` and so on. 
* `mount_sysfs (inner_child)` tries to create `/sys/fs/cgroup` * This fails 370 stat("/sys/fs", {st_dev=makedev(0, 28), st_ino=13880, st_mode=S_IFDIR|0755, st_nlink=3, st_uid=65534, st_gid=65534, st_blksize=4096, st_blocks=0, st_size=60, st_atime=2016/10/14-05:16:43.398665943, st_mtime=2016/10/14-05:16:43.399665943, st_ctime=2016/10/14-05:16:43.399665943}) = 0 370 mkdir("/sys/fs/cgroup", 0755) = -1 EACCES (Permission denied) * `mount_sysfs (inner_child)` ignores that error and mount(NULL, "/sys", NULL, MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_REMOUNT|MS_BIND, NULL) = 0 * `mount_cgroups` finally fails --- src/nspawn/nspawn-mount.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) (limited to 'src') diff --git a/src/nspawn/nspawn-mount.c b/src/nspawn/nspawn-mount.c index 44dc9bfcf4..115de64cf9 100644 --- a/src/nspawn/nspawn-mount.c +++ b/src/nspawn/nspawn-mount.c @@ -300,6 +300,59 @@ int mount_sysfs(const char *dest) { MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, NULL); } +static int mkdir_userns(const char *path, mode_t mode, bool in_userns, uid_t uid_shift) { + int r; + + assert(path); + + r = mkdir(path, mode); + if (r < 0 && errno != EEXIST) + return -errno; + + if (!in_userns) { + r = lchown(path, uid_shift, uid_shift); + if (r < 0) + return -errno; + } + + return 0; +} + +static int mkdir_userns_p(const char *prefix, const char *path, mode_t mode, bool in_userns, uid_t uid_shift) { + const char *p, *e; + int r; + + assert(path); + + if (prefix && !path_startswith(path, prefix)) + return -ENOTDIR; + + /* create every parent directory in the path, except the last component */ + p = path + strspn(path, "/"); + for (;;) { + char t[strlen(path) + 1]; + + e = p + strcspn(p, "/"); + p = e + strspn(e, "/"); + + /* Is this the last component? 
If so, then we're done */ + if (*p == 0) + break; + + memcpy(t, path, e - path); + t[e-path] = 0; + + if (prefix && path_startswith(prefix, t)) + continue; + + r = mkdir_userns(t, mode, in_userns, uid_shift); + if (r < 0) + return r; + } + + return mkdir_userns(path, mode, in_userns, uid_shift); +} + int mount_all(const char *dest, bool use_userns, bool in_userns, bool use_netns, @@ -361,7 +414,7 @@ int mount_all(const char *dest, if (mount_table[k].what && r > 0) continue; - r = mkdir_p(where, 0755); + r = mkdir_userns_p(dest, where, 0755, in_userns, uid_shift); if (r < 0 && r != -EEXIST) { if (mount_table[k].fatal) return log_error_errno(r, "Failed to create directory %s: %m", where); -- cgit v1.2.3-54-g00ecf From 6d66bd3b2a7ebe99aa7fcd06df9bc05b178a142a Mon Sep 17 00:00:00 2001 From: Evgeny Vereshchagin Date: Thu, 20 Oct 2016 09:05:46 +0000 Subject: nspawn: become a new root early https://github.com/torvalds/linux/commit/036d523641c66bef713042894a17f4335f199e49 > vfs: Don't create inodes with a uid or gid unknown to the vfs It is expected that filesystems can not represent uids and gids from outside of their user namespace. Keep things simple by not even trying to create filesystem nodes with non-sense uids and gids. So, we actually should `reset_uid_gid` early to prevent https://github.com/systemd/systemd/pull/4223#issuecomment-252522955 $ sudo UNIFIED_CGROUP_HIERARCHY=no LD_LIBRARY_PATH=.libs .libs/systemd-nspawn -D /var/lib/machines/fedora-rawhide -U -b systemd.unit=multi-user.target Spawning container fedora-rawhide on /var/lib/machines/fedora-rawhide. Press ^] three times within 1s to kill container. Child died too early. Selected user namespace base 1073283072 and range 65536. 
Failed to mount to /sys/fs/cgroup/systemd: No such file or directory Details: https://github.com/systemd/systemd/pull/4223#issuecomment-253046519 Fixes: #4352 --- src/nspawn/nspawn.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'src') diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index 2cbe563953..295293858e 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -2684,6 +2684,10 @@ static int inner_child( } } + r = reset_uid_gid(); + if (r < 0) + return log_error_errno(r, "Couldn't become new root: %m"); + r = mount_all(NULL, arg_userns_mode != USER_NAMESPACE_NO, true, @@ -2726,10 +2730,6 @@ static int inner_child( return r; } - r = reset_uid_gid(); - if (r < 0) - return log_error_errno(r, "Couldn't become new root: %m"); - r = setup_boot_id(NULL); if (r < 0) return r; -- cgit v1.2.3-54-g00ecf