├── .gitignore ├── COPYING ├── Makefile ├── README ├── TIPS ├── console.c ├── contain.c ├── contain.h ├── inject.c ├── map.c ├── mount.c ├── pseudo.c └── util.c /.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | /contain 3 | /inject 4 | /pseudo 5 | /tags 6 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | Copyright (C) 2013 Chris Webb 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to 5 | deal in the Software without restriction, including without limitation the 6 | rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 7 | sell copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 19 | IN THE SOFTWARE. 
20 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | BINDIR := $(PREFIX)/bin 2 | CFLAGS := -Os -Wall -Wfatal-errors 3 | 4 | BINARIES := inject 5 | SUIDROOT := contain pseudo 6 | 7 | %:: %.c Makefile 8 | $(CC) $(CFLAGS) -o $@ $(filter %.c,$^) 9 | 10 | all: $(BINARIES) $(SUIDROOT) 11 | 12 | contain: contain.[ch] console.c map.c mount.c util.c 13 | 14 | inject: contain.h inject.c map.c util.c 15 | 16 | pseudo: contain.h pseudo.c map.c util.c 17 | 18 | clean: 19 | rm -f $(BINARIES) $(SUIDROOT) 20 | 21 | install: $(BINARIES) $(SUIDROOT) 22 | mkdir -p $(DESTDIR)$(BINDIR) 23 | install -s $(BINARIES) $(DESTDIR)$(BINDIR) 24 | install -o root -g root -m 4755 -s $(SUIDROOT) $(DESTDIR)$(BINDIR) 25 | 26 | .PHONY: all clean install 27 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | Containers 2 | ========== 3 | 4 | This package is a simple implementation of containers for Linux, making 5 | secure containers as easy to create and use as a traditional chroot. It 6 | comprises three utilities, contain, inject and pseudo, which use the kernel 7 | support for user namespaces merged in Linux 3.8. 8 | 9 | 10 | Demonstration 11 | ------------- 12 | 13 | With the utilities already installed, the demo begins in an unprivileged 14 | user's shell: 15 | 16 | $ echo $$ $UID 17 | 21260 1000 18 | 19 | To create a simple test container, copy /bin and /lib* from the host into a 20 | temporary directory with the default UID/GID mappings applied: 21 | 22 | $ cd $(mktemp -d) 23 | $ tar -c -f - -C / bin lib lib32 lib64 | pseudo tar -x -f - 24 | 25 | It is very straightforward to launch a container with this newly-created 26 | root filesystem: 27 | 28 | $ contain . 
/bin/bash 29 | # 30 | 31 | The new shell has PID 1 within the container, and cannot see other processes 32 | on the host: 33 | 34 | # echo $$ $UID 35 | 1 0 36 | # ps ax 37 | PID TTY STAT TIME COMMAND 38 | 1 console Ss 0:00 /bin/bash 39 | 2 console R+ 0:00 ps ax 40 | 41 | The container root user is able to manipulate ownerships and permissions 42 | within its filesystem: 43 | 44 | # ls -l /dev/console 45 | crw--w---- 1 0 5 136, 9 Jul 1 14:00 /dev/console 46 | # chown 12:34 /dev/console 47 | # chmod a+rw /dev/console 48 | # ls -l /dev/console 49 | crw-rw-rw- 1 12 34 136, 9 Jul 1 14:00 /dev/console 50 | 51 | and can also make other privileged changes such as setting the hostname: 52 | 53 | # echo -n "hostname $(hostname) -> " && hostname brian && hostname 54 | hostname alice -> brian 55 | 56 | or configuring the network stack: 57 | 58 | # ip link show 59 | 1: lo: mtu 65536 qdisc noop state DOWN mode DEFAULT 60 | link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00 61 | # ping -w 1 1.2.3.4 &>/dev/null && echo up || echo down 62 | down 63 | # ip addr add 1.2.3.4/32 dev lo && ip link set lo up 64 | # ping -w 1 1.2.3.4 &>/dev/null && echo up || echo down 65 | up 66 | # ip link add type veth && ip link show 67 | 1: lo: mtu 65536 qdisc noqueue state UNKNOWN mode DEFAULT 68 | link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00 69 | 2: veth0: mtu 1500 qdisc noop state DOWN mode DEFAULT qlen 1000 70 | link/ether 3a:0c:96:36:2d:ff brd ff:ff:ff:ff:ff:ff 71 | 3: veth1: mtu 1500 qdisc noop state DOWN mode DEFAULT qlen 1000 72 | link/ether a2:86:1a:92:58:cb brd ff:ff:ff:ff:ff:ff 73 | 74 | In all cases, these changes affect the container but not the host as a 75 | whole. Processes in the container live in different resource namespaces 76 | isolated from the host, and the container root user is unable to do anything 77 | that would require elevated capabilities or root privilege on the host 78 | itself. 
79 | 80 | 81 | contain 82 | ------- 83 | 84 | The contain utility is invoked as 85 | 86 | contain [OPTIONS] DIR [CMD [ARG]...] 87 | 88 | with options 89 | 90 | -c disable console emulation in the container 91 | -g MAP set the container-to-host GID map 92 | -i CMD run a helper child inside the new namespaces 93 | -n share the host network unprivileged in the container 94 | -o CMD run a helper child outside the new namespaces 95 | -u MAP set the container-to-host UID map 96 | 97 | and creates a new container with DIR recursively bound as its root 98 | filesystem, running CMD as PID 1 within that container. If unspecified, CMD 99 | defaults to /bin/sh to start a shell, so to fully boot a distribution, 100 | specify CMD as /bin/init or /sbin/init. 101 | 102 | The container init process is isolated in new user, cgroup, mount, IPC, UTS, 103 | time and PID namespaces. A synthetic /dev with device nodes bound from the 104 | host /dev is automatically mounted within the new mount namespace, together 105 | with standard /dev/pts, /proc and /sys filesystems. 106 | 107 | Because it runs in its own user namespace, users and groups seen inside a 108 | container are not the same as the underlying credentials visible for the 109 | same processes and files on the host. Sensible default container-to-host UID 110 | and GID mappings are provided and described below, but the -u and -g options 111 | can be used to override the defaults. 112 | 113 | The container console is a host pseudo-terminal bound at /dev/console in the 114 | new /dev filesystem: stdin and stdout are copied to/from this, and it serves 115 | as stdin, stdout and stderr for the container init process. This console 116 | emulation can be disabled using the -c option: if -c is used, init is run 117 | directly with the stdin, stdout and stderr of the contain command. 118 | 119 | Containers are usually isolated in their own network namespace, with a 120 | distinct set of network interfaces from the host. 
By specifying the -n 121 | option, it is possible to safely share the host network stack instead. If 122 | you do this, user networking within the container will work normally, but 123 | the container has no privileges with respect to its network namespace so it 124 | isn't possible to (re)configure interfaces or routes, and setuid utilities 125 | like ping which use a raw socket will fail. 126 | 127 | Two different kinds of helper program can be used to help set up a 128 | container. A program specified with -i is run inside the new namespaces with 129 | the new root filesystem as its working directory, just before pivoting into 130 | it. Typically this type of helper is used to bind-mount additional parts of 131 | the host filesystem inside the container. 132 | 133 | A helper specified with -o is run outside the namespaces but as a direct 134 | child of the supervisor process which is running within them. This type of 135 | helper can be used to move host network interfaces (such as a macvtap 136 | interface or one half of a veth pair) into the container's network 137 | namespace. 138 | 139 | The environment of the container init process includes "container=contain" 140 | so that distributions can identify when they are running under contain. 141 | 142 | 143 | inject 144 | ------ 145 | 146 | The inject utility is invoked as 147 | 148 | inject PID [CMD [ARG]...] 149 | 150 | where PID is the process ID of a running container supervisor, and runs a 151 | command or shell inside the existing container. The environment, stdin, 152 | stdout and stderr of inject are all inherited by the command to be run. 153 | 154 | The container supervisor PID (i.e. that of contain itself) should be given 155 | to inject, not the PID of the descendant init process. The inject utility 156 | will only work if the process specified has a child with "container=contain" 157 | in its environment, which it assumes to be the container init. 
158 | 159 | Linux allows an unprivileged user to join the user namespace of any process 160 | he can dump or ptrace, so inject need not be installed setuid even if 161 | contain and pseudo are setuid root. It will refuse to run if it detects 162 | setuid/setgid operation. 163 | 164 | 165 | pseudo 166 | ------ 167 | 168 | The pseudo utility is invoked as 169 | 170 | pseudo [OPTIONS] [CMD [ARG]...] 171 | 172 | with options 173 | 174 | -g MAP set the user namespace GID map 175 | -u MAP set the user namespace UID map 176 | 177 | and runs a command or shell as root in a new user namespace, by analogy with 178 | sudo which runs a command as root in the host user namespace. 179 | 180 | Unlike contain, pseudo does not unshare other namespaces or attempt to 181 | isolate the new process from the rest of the host. It has identical default 182 | UID/GID mappings, -u and -g options, and support for /etc/subuid and 183 | /etc/subgid when installed setuid root, but no other contain options are 184 | supported. 185 | 186 | One use for pseudo is as a more capable replacement for fakeroot, useful for 187 | testing, when building software packages or for constructing system images. 188 | Unlike the traditional fakeroot approach based on LD_PRELOAD, static 189 | binaries and chroot jails are both handled correctly. 190 | 191 | It is also invaluable for running host software to access the same 192 | filesystem as a container, replicating the user and group file ownerships 193 | that the container would see. For example, in the demo above, the system 194 | image is untarred under pseudo so that files are written into the filesystem 195 | with UIDs and GIDs mapped for the container rather than unmapped as on the 196 | host. 
197 | 198 | 199 | User and group mappings 200 | ----------------------- 201 | 202 | By default, when run as root, contain and pseudo will map container UID/GID 203 | 0 onto the highest available host UID/GID (4294967294 unless nested), and 204 | all other UIDs/GIDs are mapped onto themselves apart from the top container 205 | UID and GID which must be left unmapped. 206 | 207 | The default mappings avoid host UID and GID 0 as the host root user is still 208 | granted a variety of privileges even after dropping all capabilities in the 209 | host user namespace. For example, /proc and /sys files typically have (host) 210 | root:root ownership, and allowing the container unfiltered access to 211 | things like /proc/sys is dangerous. 212 | 213 | Run as an unprivileged user, container UID/GID 0 is mapped onto the 214 | unprivileged user's UID/GID, then container UIDs/GIDs 1, 2, etc. are 215 | successively mapped onto any ranges delegated to that user in /etc/subuid 216 | and /etc/subgid. 217 | 218 | The -u and -g options can be used to specify custom mappings, in the format 219 | START:LOWER:COUNT[,START:LOWER:COUNT]... where START is the first UID/GID in 220 | a container range, LOWER is the first UID/GID in the corresponding range in 221 | the host, and COUNT is the length of these ranges. 222 | 223 | For example, -u 0:1000:1,1:4000:2000 will map container UID 0 onto host UID 224 | 1000 and container UIDs 1...2000 onto host UIDs 4000...5999. 225 | 226 | It is not possible to map more than one container ID onto a given host ID, 227 | nor to list the same container ID twice in a map specification. When invoked 228 | by an unprivileged user, all host ranges are checked against /etc/subuid and 229 | /etc/subgid. 230 | 231 | Unmapped users and groups are mapped by the kernel onto the overflow UID and 232 | GID set in /proc/sys/kernel/overflowuid and /proc/sys/kernel/overflowgid. By 233 | default the kernel sets both these values to 65534. 
234 | 235 | 236 | Unprivileged operation, /etc/subuid and /etc/subgid 237 | --------------------------------------------------- 238 | 239 | When a non-root user runs contain or pseudo unprivileged, these tools can 240 | only map container UID/GIDs onto the host UID/GID of that user. The 241 | resulting container is not very useful as it has just a single user and 242 | group available. (Typically only root is mapped in the container.) 243 | 244 | However, contain and pseudo can also be installed setuid root, and in this 245 | case, unprivileged users can also map onto ranges of UIDs/GIDs that have 246 | been delegated for their use in /etc/subuid and /etc/subgid. 247 | 248 | The format of these files is similar to /etc/passwd, /etc/group and 249 | /etc/shadow. Each line specifies an additional range of UIDs/GIDs allocated 250 | to a particular user, and there can be zero, one, or multiple lines for any 251 | given user. There are three colon-delimited fields: the user's login name, 252 | the first UID/GID in the range, and the number of UIDs/GIDs in the range. 253 | For example, an /etc/subuid containing the lines 254 | 255 | chris:100000:10000 256 | chris:120000:10000 257 | 258 | allocates UID ranges 100000-109999 and 120000-129999 to my user 'chris' in 259 | addition to my normal login UID. 260 | 261 | The kernel user namespace author Eric Biederman has 262 | proposed patches against the standard GNU/Linux Shadow package which add 263 | support for creating and updating these files in this format; they are 264 | likely to become a standard way to delegate sub-users and sub-groups. 265 | 266 | Linux 3.19 and later do not allow unprivileged processes to write a GID map 267 | unless the setgroups() call has been permanently disabled by writing "deny" 268 | to /proc/PID/setgroups. This is a fix for CVE-2014-8989 which applied to 269 | strangely-configured systems where group membership implies more restricted 270 | permissions rather than supplementary permissions. 
271 | 272 | As a result, when run non-setuid by an unprivileged user, contain and pseudo 273 | must disable setgroups() in the container. Conversely, when installed setuid 274 | root, they will use their privilege to bypass this kernel restriction, 275 | resulting in fully-functional containers which still support setgroups(). 276 | However, this also means that they can be used to bypass restrictions 277 | implemented by group membership. 278 | 279 | 280 | Building and installing 281 | ----------------------- 282 | 283 | Unpack the source tar.gz file and change to the unpacked directory. 284 | 285 | Run 'make', then 'make install' as root to install both binaries setuid root 286 | in /bin. Alternatively, you can set DESTDIR and/or BINDIR to install in a 287 | different location, or strip and copy the compiled binaries into the correct 288 | place manually. 289 | 290 | Note that setuid contain and pseudo effectively enable unprivileged users to 291 | drop supplementary group memberships using setgroups(). Consequently, 292 | they should NOT be installed setuid root on systems where group membership 293 | implies more restricted permissions rather than supplementary permissions. 294 | 295 | These utilities were developed on GNU/Linux and are not portable to other 296 | platforms as they rely on Linux-specific facilities such as namespaces. 297 | Please report any problems or bugs to Chris Webb . 298 | 299 | 300 | Copying 301 | ------- 302 | 303 | This software was written by Chris Webb and is 304 | distributed as Free Software under the terms of the MIT license in COPYING. 305 | -------------------------------------------------------------------------------- /TIPS: -------------------------------------------------------------------------------- 1 | Shutting down or killing a container 2 | ------------------------------------ 3 | 4 | From the host, the inject utility can be used to run an appropriate command 5 | within the container to start a graceful shut down. 
For example 6 | 7 | inject PID /bin/halt 8 | 9 | To immediately kill a container and all its processes, it is sufficient to 10 | send the init process a SIGKILL from the host using 11 | 12 | pkill -KILL -P PID 13 | 14 | where PID is the process ID of a running container supervisor. It is very 15 | important not to SIGKILL the container supervisor itself or the container 16 | will be orphaned, continuing to run unsupervised as a child of the host 17 | init. 18 | 19 | 20 | Using cgroups to limit memory and processes available to a container 21 | -------------------------------------------------------------------- 22 | 23 | If cgroup support, the memory controller and the pids controller are 24 | compiled into the kernel, a mounted cgroup2 filesystem can be used to apply 25 | memory and process-count limits to a container as it is started. For 26 | example, the shell script 27 | 28 | #!/bin/sh -e 29 | echo +memory +pids >/sys/fs/cgroup/cgroup.subtree_control 30 | mkdir /sys/fs/cgroup/mycontainer 31 | echo $$ >/sys/fs/cgroup/mycontainer/cgroup.procs 32 | echo 2G >/sys/fs/cgroup/mycontainer/memory.high 33 | echo 3G >/sys/fs/cgroup/mycontainer/memory.max 34 | echo 2G >/sys/fs/cgroup/mycontainer/memory.swap.max 35 | echo 256 >/sys/fs/cgroup/mycontainer/pids.max 36 | exec contain [...] 37 | 38 | applies a best-efforts limit of 2GB memory with a hard limit of 3GB. Swap 39 | usage is restricted to at most 2G, and no more than 256 processes can be 40 | forked within the container. 41 | 42 | In addition, if contain is built and run on Linux 4.6 or later, a cgroup 43 | namespace will be used to virtualise the container's view of the cgroup 44 | hierarchy in /sys/fs/cgroup and /proc/*/cgroup. /sys/fs/cgroup/mycontainer 45 | will appear as the root of the hierarchy at /sys/fs/cgroup within the 46 | container. 47 | 48 | See linux/kernel/Documentation/cgroup-v2.txt for detailed info on the 49 | available controllers and configuration parameters. 
50 | 51 | 52 | Troubleshooting 53 | --------------- 54 | 55 | The contain/pseudo error message 'Failed to unshare user namespace: Invalid 56 | argument' typically means that your kernel is not compiled with support for 57 | user namespaces, i.e. CONFIG_USER_NS is not set. The contain tool will also 58 | die with a similar message referring to one of the other required namespaces 59 | if support for that is not available in the kernel. 60 | 61 | To run these tools you need to be running Linux 3.8 or later with 62 | 63 | CONFIG_CGROUPS=y 64 | CONFIG_UTS_NS=y 65 | CONFIG_TIME_NS=y 66 | CONFIG_IPC_NS=y 67 | CONFIG_USER_NS=y 68 | CONFIG_PID_NS=y 69 | CONFIG_NET_NS=y 70 | 71 | set in the kernel build config. Note that before Linux 3.12, CONFIG_XFS_FS 72 | conflicted with CONFIG_USER_NS, so these tools could not be used where XFS 73 | support was compiled either into the kernel or as a module. 74 | 75 | The contain tool will fail to mount /dev/pts unless 76 | 77 | CONFIG_DEVPTS_MULTIPLE_INSTANCES=y 78 | 79 | is set in the kernel build config. Both container and host /dev/pts must be 80 | mounted with -o newinstance, with /dev/ptmx symlinked to pts/ptmx. 81 | 82 | Linux 3.12 introduced tighter restrictions on mounting proc and sysfs, which 83 | broke older versions of contain. To comply with these new rules, contain 84 | now ensures that procfs and sysfs are mounted in the new mount namespace 85 | before pivoting into the container and detaching the host root. 86 | 87 | A bug in Linux 3.12 will prevent contain from mounting /proc in a container 88 | if binfmt_misc is mounted on /proc/sys/fs/binfmt_misc in the host 89 | filesystem. This was fixed in Linux 3.13. 90 | 91 | Linux 3.19 introduced restrictions on writing a user namespace GID map as an 92 | unprivileged user unless setgroups() has been permanently disabled, which 93 | broke older versions of contain. 
Run non-setuid and unprivileged, contain 94 | and pseudo must now disable setgroups() to create containers, but if they 95 | are installed setuid, they will bypass this kernel restriction and leave 96 | setgroups() enabled in the resulting containers. 97 | -------------------------------------------------------------------------------- /console.c: -------------------------------------------------------------------------------- 1 | #define _GNU_SOURCE 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include "contain.h" 17 | 18 | static struct termios saved; 19 | 20 | int getconsole(void) { 21 | int master, null; 22 | 23 | if ((null = open("/dev/null", O_RDWR)) < 0) 24 | errx(EXIT_FAILURE, "Failed to open /dev/null"); 25 | 26 | if (fcntl(STDIN_FILENO, F_GETFD) < 0) 27 | dup2(null, STDIN_FILENO); 28 | if (fcntl(STDOUT_FILENO, F_GETFD) < 0) 29 | dup2(null, STDOUT_FILENO); 30 | if (fcntl(STDERR_FILENO, F_GETFD) < 0) 31 | dup2(null, STDERR_FILENO); 32 | 33 | if (null != STDIN_FILENO) 34 | if (null != STDOUT_FILENO) 35 | if (null != STDERR_FILENO) 36 | close(null); 37 | 38 | if ((master = posix_openpt(O_RDWR | O_NOCTTY)) < 0) 39 | errx(EXIT_FAILURE, "Failed to allocate a console pseudo-terminal"); 40 | grantpt(master); 41 | unlockpt(master); 42 | return master; 43 | } 44 | 45 | static void rawmode(void) { 46 | struct termios termios; 47 | 48 | if (!isatty(STDIN_FILENO)) 49 | return; 50 | if (tcgetattr(STDIN_FILENO, &termios) < 0) 51 | err(EXIT_FAILURE, "tcgetattr"); 52 | cfmakeraw(&termios); 53 | tcsetattr(STDIN_FILENO, TCSANOW, &termios); 54 | } 55 | 56 | static void restoremode(void) { 57 | if (isatty(STDIN_FILENO)) 58 | tcsetattr(STDIN_FILENO, TCSANOW, &saved); 59 | } 60 | 61 | static void savemode(void) { 62 | if (isatty(STDIN_FILENO) && tcgetattr(STDIN_FILENO, &saved) < 0) 63 | err(EXIT_FAILURE, "tcgetattr"); 64 | } 65 | 66 
| void setconsole(char *name) { 67 | int console; 68 | struct termios termios; 69 | 70 | setsid(); 71 | 72 | if ((console = open(name, O_RDWR)) < 0) 73 | errx(EXIT_FAILURE, "Failed to open console in container"); 74 | ioctl(console, TIOCSCTTY, NULL); 75 | 76 | if (tcgetattr(console, &termios) < 0) 77 | err(EXIT_FAILURE, "tcgetattr"); 78 | termios.c_iflag |= IGNBRK | IUTF8; 79 | tcsetattr(console, TCSANOW, &termios); 80 | 81 | dup2(console, STDIN_FILENO); 82 | dup2(console, STDOUT_FILENO); 83 | dup2(console, STDERR_FILENO); 84 | if (console != STDIN_FILENO) 85 | if (console != STDOUT_FILENO) 86 | if (console != STDERR_FILENO) 87 | close(console); 88 | } 89 | 90 | int supervise(pid_t child, int console) { 91 | char buffer[PIPE_BUF]; 92 | int signals, slave, status; 93 | sigset_t mask; 94 | ssize_t count, length, offset; 95 | struct pollfd fds[3]; 96 | 97 | if (console < 0) { 98 | if (waitpid(child, &status, 0) < 0) 99 | err(EXIT_FAILURE, "waitpid"); 100 | return WIFEXITED(status) ? WEXITSTATUS(status) : EXIT_FAILURE; 101 | } 102 | 103 | sigemptyset(&mask); 104 | sigaddset(&mask, SIGCHLD); 105 | sigprocmask(SIG_BLOCK, &mask, NULL); 106 | if ((signals = signalfd(-1, &mask, 0)) < 0) 107 | err(EXIT_FAILURE, "signalfd"); 108 | 109 | if (waitpid(child, &status, WNOHANG) > 0) 110 | if (WIFEXITED(status) || WIFSIGNALED(status)) 111 | raise(SIGCHLD); 112 | 113 | savemode(); 114 | atexit(restoremode); 115 | rawmode(); 116 | 117 | slave = open(ptsname(console), O_RDWR); 118 | 119 | fds[0].fd = console; 120 | fds[0].events = POLLIN; 121 | fds[1].fd = STDIN_FILENO; 122 | fds[1].events = POLLIN; 123 | fds[2].fd = signals; 124 | fds[2].events = POLLIN; 125 | 126 | while (1) { 127 | if (poll(fds, 3, -1) < 0) 128 | if (errno != EAGAIN && errno != EINTR) 129 | err(EXIT_FAILURE, "poll"); 130 | 131 | if (fds[0].revents & POLLIN) { 132 | if ((length = read(console, buffer, sizeof(buffer))) < 0) 133 | if (errno != EAGAIN && errno != EINTR) 134 | err(EXIT_FAILURE, "read"); 135 | for 
(offset = 0; length > 0; offset += count, length -= count) 136 | while ((count = write(STDOUT_FILENO, buffer + offset, length)) < 0) 137 | if (errno != EAGAIN && errno != EINTR) 138 | err(EXIT_FAILURE, "write"); 139 | } 140 | 141 | if (fds[1].revents & (POLLHUP | POLLIN)) { 142 | if ((length = read(STDIN_FILENO, buffer, sizeof(buffer))) == 0) 143 | fds[1].events = 0; 144 | else if (length < 0 && errno != EAGAIN && errno != EINTR) 145 | err(EXIT_FAILURE, "read"); 146 | for (offset = 0; length > 0; offset += count, length -= count) 147 | while ((count = write(console, buffer + offset, length)) < 0) 148 | if (errno != EAGAIN && errno != EINTR) 149 | err(EXIT_FAILURE, "write"); 150 | } 151 | 152 | if (fds[2].revents & POLLIN) { 153 | if (read(signals, buffer, sizeof(buffer)) < 0) 154 | if (errno != EAGAIN && errno != EINTR) 155 | err(EXIT_FAILURE, "read"); 156 | if (waitpid(child, &status, WNOHANG) > 0) 157 | if (WIFEXITED(status) || WIFSIGNALED(status)) 158 | break; 159 | } 160 | } 161 | 162 | close(signals); 163 | close(slave); 164 | 165 | while ((length = read(console, buffer, sizeof(buffer)))) { 166 | if (length < 0 && errno != EAGAIN && errno != EINTR) 167 | break; 168 | for (offset = 0; length > 0; offset += count, length -= count) 169 | while ((count = write(STDOUT_FILENO, buffer + offset, length)) < 0) 170 | if (errno != EAGAIN && errno != EINTR) 171 | err(EXIT_FAILURE, "write"); 172 | } 173 | 174 | return WIFEXITED(status) ? 
WEXITSTATUS(status) : EXIT_FAILURE; 175 | } 176 | -------------------------------------------------------------------------------- /contain.c: -------------------------------------------------------------------------------- 1 | #define _GNU_SOURCE 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include "contain.h" 18 | 19 | static void usage(const char *progname) { 20 | fprintf(stderr, "\ 21 | Usage: %s [OPTIONS] DIR [CMD [ARG]...]\n\ 22 | Options:\n\ 23 | -c disable console emulation in the container\n\ 24 | -g MAP set the container-to-host GID map\n\ 25 | -i CMD run a helper child inside the new namespaces\n\ 26 | -n share the host network unprivileged in the container\n\ 27 | -o CMD run a helper child outside the new namespaces\n\ 28 | -u MAP set the container-to-host UID map\n\ 29 | GID and UID maps are specified as START:LOWER:COUNT[,START:LOWER:COUNT]...\n\ 30 | ", progname); 31 | exit(EX_USAGE); 32 | } 33 | 34 | int main(int argc, char **argv) { 35 | char *gidmap = NULL, *inside = NULL, *outside = NULL, *uidmap = NULL; 36 | int hostnet = 0, master, option, stdio = 0; 37 | pid_t child, parent; 38 | 39 | while ((option = getopt(argc, argv, "+:cg:i:no:u:")) > 0) 40 | switch (option) { 41 | case 'c': 42 | stdio++; 43 | break; 44 | case 'g': 45 | gidmap = optarg; 46 | break; 47 | case 'i': 48 | inside = optarg; 49 | break; 50 | case 'n': 51 | hostnet++; 52 | break; 53 | case 'o': 54 | outside = optarg; 55 | break; 56 | case 'u': 57 | uidmap = optarg; 58 | break; 59 | default: 60 | usage(argv[0]); 61 | } 62 | 63 | if (argc <= optind) 64 | usage(argv[0]); 65 | 66 | parent = getpid(); 67 | switch (child = fork()) { 68 | case -1: 69 | err(EXIT_FAILURE, "fork"); 70 | case 0: 71 | raise(SIGSTOP); 72 | if (geteuid() != 0) 73 | denysetgroups(parent); 74 | writemap(parent, GID, gidmap); 75 | writemap(parent, UID, 
uidmap); 76 | 77 | if (outside) { 78 | if (setgid(getgid()) < 0 || setuid(getuid()) < 0) 79 | errx(EXIT_FAILURE, "Failed to drop privileges"); 80 | prctl(PR_SET_DUMPABLE, 1); 81 | execlp(SHELL, SHELL, "-c", outside, NULL); 82 | err(EXIT_FAILURE, "exec %s", outside); 83 | } 84 | 85 | exit(EXIT_SUCCESS); 86 | } 87 | 88 | if (setgid(getgid()) < 0 || setuid(getuid()) < 0) 89 | errx(EXIT_FAILURE, "Failed to drop privileges"); 90 | prctl(PR_SET_DUMPABLE, 1); 91 | 92 | if (unshare(CLONE_NEWUSER) < 0) 93 | errx(EXIT_FAILURE, "Failed to unshare user namespace"); 94 | 95 | #ifdef CLONE_NEWCGROUP 96 | if (unshare(CLONE_NEWCGROUP) < 0) 97 | errx(EXIT_FAILURE, "Failed to unshare cgroup namespace"); 98 | #endif 99 | 100 | if (unshare(CLONE_NEWIPC) < 0) 101 | errx(EXIT_FAILURE, "Failed to unshare IPC namespace"); 102 | 103 | if (!hostnet && unshare(CLONE_NEWNET) < 0) 104 | errx(EXIT_FAILURE, "Failed to unshare network namespace"); 105 | 106 | if (unshare(CLONE_NEWNS) < 0) 107 | errx(EXIT_FAILURE, "Failed to unshare mount namespace"); 108 | 109 | #ifdef CLONE_NEWTIME 110 | if (unshare(CLONE_NEWTIME) < 0) 111 | errx(EXIT_FAILURE, "Failed to unshare time namespace"); 112 | #endif 113 | 114 | if (unshare(CLONE_NEWUTS) < 0) 115 | errx(EXIT_FAILURE, "Failed to unshare UTS namespace"); 116 | 117 | waitforstop(child); 118 | kill(child, SIGCONT); 119 | waitforexit(child); 120 | 121 | setgid(0); 122 | setgroups(0, NULL); 123 | setuid(0); 124 | 125 | master = stdio ? 
-1 : getconsole(); 126 | createroot(argv[optind], master, inside); 127 | 128 | if (unshare(CLONE_NEWPID) < 0) 129 | errx(EXIT_FAILURE, "Failed to unshare PID namespace"); 130 | 131 | switch (child = fork()) { 132 | case -1: 133 | err(EXIT_FAILURE, "fork"); 134 | case 0: 135 | mountproc(); 136 | if (!hostnet) 137 | mountsys(); 138 | enterroot(); 139 | 140 | if (master >= 0) { 141 | close(master); 142 | setconsole("/dev/console"); 143 | } 144 | 145 | clearenv(); 146 | putenv("container=contain"); 147 | 148 | if (argv[optind + 1]) 149 | execv(argv[optind + 1], argv + optind + 1); 150 | else 151 | execl(SHELL, SHELL, NULL); 152 | err(EXIT_FAILURE, "exec"); 153 | } 154 | 155 | return supervise(child, master); 156 | } 157 | -------------------------------------------------------------------------------- /contain.h: -------------------------------------------------------------------------------- 1 | #ifndef CONTAIN_H 2 | #define CONTAIN_H 3 | 4 | #define GID 0 5 | #define UID 1 6 | #define INVALID ((unsigned) -1) 7 | #define SHELL "/bin/sh" 8 | 9 | #define getid(type) ((unsigned) ((type) == GID ? getgid() : getuid())) 10 | #define idfile(type) ((type) == GID ? "gid_map" : "uid_map") 11 | #define idname(type) ((type) == GID ? "GID" : "UID") 12 | #define subpath(type) ((type) == GID ? 
"/etc/subgid" : "/etc/subuid") 13 | 14 | char *append(char **destination, const char *format, ...); 15 | void createroot(char *src, int console, char *helper); 16 | void denysetgroups(pid_t pid); 17 | void enterroot(void); 18 | int getconsole(void); 19 | void mountproc(void); 20 | void mountsys(void); 21 | void seal(char **argv, char **envp); 22 | void setconsole(char *name); 23 | char *string(const char *format, ...); 24 | int supervise(pid_t child, int console); 25 | char *tmpdir(void); 26 | void waitforstop(pid_t child); 27 | void waitforexit(pid_t child); 28 | void writemap(pid_t pid, int type, char *map); 29 | 30 | #endif 31 | -------------------------------------------------------------------------------- /inject.c: -------------------------------------------------------------------------------- 1 | #define _GNU_SOURCE 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include "contain.h" 17 | 18 | static int getparent(pid_t child) { 19 | char *end, *line = NULL, *path, *start; 20 | pid_t parent = -1; 21 | size_t size; 22 | FILE *file; 23 | 24 | path = string("/proc/%u/stat", child); 25 | file = fopen(path, "r"); 26 | free(path); 27 | 28 | if (file && getline(&line, &size, file) >= 0) 29 | /* "PID (NAME) S PPID ...", so PPID begins 4 chars after the last ')' */ 30 | if ((start = strrchr(line, ')')) && strlen(start) >= 4) { 31 | parent = strtol(start + 4, &end, 10); 32 | if (end == start || *end != ' ') 33 | parent = -1; 34 | } 35 | 36 | if (file) 37 | fclose(file); 38 | if (line) 39 | free(line); 40 | 41 | return parent; 42 | } 43 | 44 | static void join(pid_t pid, char *type) { 45 | char *path; 46 | int fd; 47 | 48 | path = string("/proc/%u/ns/%s", pid, type); 49 | 50 | if ((fd = open(path, O_RDONLY)) >= 0) { 51 | if (syscall(__NR_setns, fd, 0) < 0 && strcmp(type, "user") == 0) 52 | errx(EXIT_FAILURE, "Failed to join 
/*
 * Write the command synopsis for `progname` to stderr and terminate the
 * process. Never returns.
 */
static void usage(const char *progname) {
  /* Numerically the same as EX_USAGE from <sysexits.h> */
  enum { USAGE_STATUS = 64 };

  fprintf(stderr, "Usage: %s PID [CMD [ARG]...]\n", progname);
  exit(USAGE_STATUS);
}
/*
 * Write "deny" to /proc/<pid>/setgroups. Exits with an error message if the
 * file cannot be opened for writing or the write is short.
 */
void denysetgroups(pid_t pid) {
  static const char policy[] = "deny";
  const size_t length = sizeof policy - 1;
  char *procfile = string("/proc/%d/setgroups", pid);
  int fd = open(procfile, O_WRONLY);

  /* Both failure modes are reported with the same message */
  if (fd < 0 || write(fd, policy, length) != (ssize_t) length)
    errx(EXIT_FAILURE, "Failed to disable setgroups() in container");

  close(fd);
  free(procfile);
}
"," : "", first, lower, count); 44 | } 45 | 46 | if (!result) 47 | errx(EXIT_FAILURE, "Invalid map data in %s", path); 48 | 49 | fclose(file); 50 | free(line); 51 | free(path); 52 | return result; 53 | } 54 | 55 | static char *mapitem(char *map, unsigned *first, unsigned *lower, 56 | unsigned *count) { 57 | ssize_t skip; 58 | 59 | while (map && *map && strchr(",;", *map)) 60 | map++; 61 | if (map == NULL || *map == '\0') 62 | return NULL; 63 | if (sscanf(map, "%u:%u:%u%zn", first, lower, count, &skip) < 3) 64 | errx(EXIT_FAILURE, "Invalid ID map '%s'", map); 65 | return map + skip; 66 | } 67 | 68 | static char *rangeitem(char *range, unsigned *start, unsigned *length) { 69 | ssize_t skip; 70 | 71 | while (range && *range && strchr(",;", *range)) 72 | range++; 73 | if (range == NULL || *range == '\0') 74 | return NULL; 75 | if (sscanf(range, "%u:%u%zn", start, length, &skip) < 2) 76 | errx(EXIT_FAILURE, "Invalid ID range '%s'", range); 77 | return range + skip; 78 | } 79 | 80 | static char *readranges(int type) { 81 | char *line = NULL, *entry, *range, *user; 82 | size_t end, size; 83 | struct passwd *passwd; 84 | uid_t uid; 85 | unsigned int length, start; 86 | FILE *file; 87 | 88 | range = string("%u:1", getid(type)); 89 | if (!(file = fopen(subpath(type), "r"))) 90 | return range; 91 | 92 | uid = getuid(); 93 | user = getenv("USER"); 94 | user = user ? user : getenv("LOGNAME"); 95 | user = user ? 
user : getlogin(); 96 | if (!user || !(passwd = getpwnam(user)) || passwd->pw_uid != uid) { 97 | if (!(passwd = getpwuid(uid))) 98 | errx(EXIT_FAILURE, "Failed to validate your username"); 99 | user = passwd->pw_name; 100 | } 101 | endpwent(); 102 | 103 | while (getline(&line, &size, file) >= 0) { 104 | if (strtol(line, &entry, 10) != uid || entry == line) { 105 | if (strncmp(line, user, strlen(user))) 106 | continue; 107 | entry = line + strlen(user); 108 | } 109 | if (sscanf(entry, ":%u:%u%zn", &start, &length, &end) < 2) 110 | continue; 111 | if (strchr(":\n", entry[end + 1])) 112 | append(&range, ",%u:%u", start, length); 113 | } 114 | 115 | free(line); 116 | fclose(file); 117 | return range; 118 | } 119 | 120 | static char *rootdefault(int type) { 121 | char *cursor, *map, *result; 122 | unsigned count, first, last = INVALID, lower; 123 | 124 | cursor = map = getmap(-1, type); 125 | while ((cursor = mapitem(cursor, &first, &lower, &count))) 126 | if (last == INVALID || last < first + count - 1) 127 | last = first + count - 1; 128 | result = string("0:%u:1", last); 129 | 130 | cursor = map; 131 | while ((cursor = mapitem(cursor, &first, &lower, &count))) { 132 | if (first == 0) { 133 | if (count == 1 && first >= last) 134 | errx(EXIT_FAILURE, "No unprivileged %s available\n", idname(type)); 135 | first++, lower++, count--; 136 | } 137 | 138 | if (last <= first + count - 1 && count > 0) 139 | count--; 140 | 141 | if (count > 0) 142 | append(&result, "%s%u:%u:%u", result ? 
"," : "", first, first, count); 143 | } 144 | 145 | free(map); 146 | return result; 147 | } 148 | 149 | static char *userdefault(int type) { 150 | char *cursor, *map, *range, *result = NULL; 151 | unsigned count, first, index = 0, length, lower, start; 152 | 153 | if (geteuid() != 0) 154 | return string("0:%u:1", getid(type)); 155 | 156 | map = getmap(-1, type); 157 | range = readranges(type); 158 | 159 | while ((range = rangeitem(range, &start, &length))) { 160 | cursor = map; 161 | while ((cursor = mapitem(cursor, &first, &lower, &count))) { 162 | if (start + length <= first || first + count <= start) 163 | continue; 164 | if (first + count < start + length) 165 | length = start - first + count; 166 | if (start < first) { 167 | index += first - start; 168 | length -= first - start; 169 | start = first; 170 | } 171 | append(&result, "%s%u:%u:%u", result ? "," : "", index, start, length); 172 | index += length; 173 | } 174 | } 175 | 176 | free(map); 177 | free(range); 178 | return result; 179 | } 180 | 181 | static void validate(char *range, unsigned first, unsigned count) { 182 | unsigned length, start; 183 | 184 | while ((range = rangeitem(range, &start, &length))) 185 | if (first < start + length && start < first + count) { 186 | if (first < start) 187 | validate(range, first, start - first); 188 | if (first + count > start + length) 189 | validate(range, start + length, first + count - start - length); 190 | return; 191 | } 192 | errx(EXIT_FAILURE, "Cannot map onto IDs that are not delegated to you"); 193 | } 194 | 195 | static void verifymap(char *map, char *range) { 196 | unsigned count, first, lower; 197 | 198 | while ((map = mapitem(map, &first, &lower, &count))) 199 | validate(range, lower, count); 200 | } 201 | 202 | void writemap(pid_t pid, int type, char *map) { 203 | char *path, *range, *text = NULL; 204 | int fd; 205 | unsigned count, first, lower; 206 | 207 | if (!map) { 208 | map = (getuid() == 0 ? 
/*
 * Bind-mount `src` onto the path `dst`, first trying to create `dst` as an
 * empty file to act as the mount point. Exits with an error message if the
 * mount fails.
 */
static void bindnode(char *src, char *dst) {
  const int placeholder = open(dst, O_WRONLY | O_CREAT, 0600);

  /* Failure to create the placeholder is not checked here; the mount below
     reports the error if the target is unusable */
  if (placeholder >= 0)
    close(placeholder);

  if (mount(src, dst, NULL, MS_BIND, NULL) >= 0)
    return;

  errx(EXIT_FAILURE, "Failed to bind %s into new /dev filesystem", src);
}
filesystem"); 47 | 48 | mkdir("dev/pts", 0755); 49 | if (mount("devpts", "dev/pts", "devpts", 0, "newinstance,ptmxmode=666") < 0) 50 | errx(EXIT_FAILURE, "Failed to mount /dev/pts in new root filesystem"); 51 | 52 | mkdir("dev/tmp", 0755); 53 | umask(mask); 54 | 55 | if (console >= 0) 56 | bindnode(ptsname(console), "dev/console"); 57 | bindnode("/dev/full", "dev/full"); 58 | bindnode("/dev/null", "dev/null"); 59 | bindnode("/dev/random", "dev/random"); 60 | bindnode("/dev/tty", "dev/tty"); 61 | bindnode("/dev/urandom", "dev/urandom"); 62 | bindnode("/dev/zero", "dev/zero"); 63 | symlink("pts/ptmx", "dev/ptmx"); 64 | 65 | if (helper) 66 | switch (child = fork()) { 67 | case -1: 68 | err(EXIT_FAILURE, "fork"); 69 | case 0: 70 | execlp(SHELL, SHELL, "-c", helper, NULL); 71 | err(EXIT_FAILURE, "exec %s", helper); 72 | default: 73 | waitforexit(child); 74 | } 75 | } 76 | 77 | void enterroot(void) { 78 | if (syscall(__NR_pivot_root, ".", "dev/tmp") < 0) 79 | errx(EXIT_FAILURE, "Failed to pivot into new root filesystem"); 80 | 81 | if (chdir("/dev/tmp") >= 0) { 82 | while (*root == '/') 83 | root++; 84 | rmdir(root); 85 | } 86 | 87 | root = NULL; 88 | 89 | if (chdir("/") < 0 || umount2("/dev/tmp", MNT_DETACH) < 0) 90 | errx(EXIT_FAILURE, "Failed to detach old root filesystem"); 91 | else 92 | rmdir("/dev/tmp"); 93 | } 94 | 95 | void mountproc(void) { 96 | mode_t mask; 97 | 98 | mask = umask(0); 99 | mkdir("proc" , 0755); 100 | umask(mask); 101 | 102 | if (mount("proc", "proc", "proc", 0, NULL) < 0) 103 | errx(EXIT_FAILURE, "Failed to mount /proc in new root filesystem"); 104 | } 105 | 106 | void mountsys(void) { 107 | mode_t mask; 108 | 109 | mask = umask(0); 110 | mkdir("sys" , 0755); 111 | umask(mask); 112 | 113 | if (mount("sysfs", "sys", "sysfs", 0, NULL) < 0) 114 | errx(EXIT_FAILURE, "Failed to mount /sys in new root filesystem"); 115 | mount("cgroup2", "sys/fs/cgroup", "cgroup2", 0, NULL); 116 | } 117 | 
-------------------------------------------------------------------------------- /pseudo.c: -------------------------------------------------------------------------------- 1 | #define _GNU_SOURCE 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "contain.h" 13 | 14 | static void usage(const char *progname) { 15 | fprintf(stderr, "\ 16 | Usage: %s [OPTIONS] [CMD [ARG]...]\n\ 17 | Options:\n\ 18 | -g MAP set the user namespace GID map\n\ 19 | -u MAP set the user namespace UID map\n\ 20 | GID and UID maps are specified as START:LOWER:COUNT[,START:LOWER:COUNT]...\n\ 21 | ", progname); 22 | exit(EX_USAGE); 23 | } 24 | 25 | int main(int argc, char **argv) { 26 | char *gidmap = NULL, *uidmap = NULL; 27 | int option; 28 | pid_t child, parent; 29 | 30 | while ((option = getopt(argc, argv, "+:g:u:")) > 0) 31 | switch (option) { 32 | case 'g': 33 | gidmap = optarg; 34 | break; 35 | case 'u': 36 | uidmap = optarg; 37 | break; 38 | default: 39 | usage(argv[0]); 40 | } 41 | 42 | parent = getpid(); 43 | switch (child = fork()) { 44 | case -1: 45 | err(EXIT_FAILURE, "fork"); 46 | case 0: 47 | raise(SIGSTOP); 48 | if (geteuid() != 0) 49 | denysetgroups(parent); 50 | writemap(parent, GID, gidmap); 51 | writemap(parent, UID, uidmap); 52 | exit(0); 53 | } 54 | 55 | if (setgid(getgid()) < 0 || setuid(getuid()) < 0) 56 | errx(EXIT_FAILURE, "Failed to drop privileges"); 57 | prctl(PR_SET_DUMPABLE, 1); 58 | 59 | if (unshare(CLONE_NEWUSER) < 0) 60 | errx(EXIT_FAILURE, "Failed to unshare user namespace"); 61 | 62 | waitforstop(child); 63 | kill(child, SIGCONT); 64 | waitforexit(child); 65 | 66 | setgid(0); 67 | setgroups(0, NULL); 68 | setuid(0); 69 | 70 | if (argv[optind]) 71 | execvp(argv[optind], argv + optind); 72 | else if (getenv("SHELL")) 73 | execl(getenv("SHELL"), getenv("SHELL"), NULL); 74 | else 75 | execl(SHELL, SHELL, NULL); 76 | 77 | err(EXIT_FAILURE, "exec"); 78 | return 
/*
 * Format `format` and its arguments printf-style and append the text to the
 * heap string *destination, which may be NULL to start a new string.
 *
 * *destination is replaced by the newly allocated result, which is also
 * returned. Exits with an error message if formatting or allocation fails.
 */
char *append(char **destination, const char *format, ...) {
  va_list args, sizing;
  char *grown, *suffix;
  size_t have;
  int want;

  /* Measure first, then format into an exactly-sized buffer */
  va_start(args, format);
  va_copy(sizing, args);
  want = vsnprintf(NULL, 0, format, sizing);
  va_end(sizing);
  if (want < 0 || !(suffix = malloc((size_t) want + 1)))
    err(EXIT_FAILURE, "asprintf");
  vsnprintf(suffix, (size_t) want + 1, format, args);
  va_end(args);

  if (*destination == NULL)
    return *destination = suffix;

  /* The suffix is fully formatted before the old string is resized, so
     arguments may safely refer to the previous contents */
  have = strlen(*destination);
  if (!(grown = realloc(*destination, have + (size_t) want + 1)))
    err(EXIT_FAILURE, "asprintf");
  memcpy(grown + have, suffix, (size_t) want + 1);
  free(suffix);
  return *destination = grown;
}
/*
 * Wait for `child` to terminate. Returns only if it exited with status
 * EXIT_SUCCESS; otherwise this process exits too, with the child's exit
 * status, or with 128 plus the signal number if the child was killed by a
 * signal.
 */
void waitforexit(pid_t child) {
  int status;

  if (waitpid(child, &status, 0) < 0)
    err(EXIT_FAILURE, "waitpid");

  /* WEXITSTATUS() is only meaningful for a normal exit, so a signal death
     has to be detected first or it would be mistaken for success */
  if (WIFSIGNALED(status))
    exit(128 + WTERMSIG(status));
  if (WEXITSTATUS(status) != EXIT_SUCCESS)
    exit(WEXITSTATUS(status));
}

/*
 * Wait until `child` stops. If it terminates instead, this process exits
 * with the child's exit status, or with 128 plus the signal number if the
 * child was killed by a signal.
 */
void waitforstop(pid_t child) {
  int status;

  if (waitpid(child, &status, WUNTRACED) < 0)
    err(EXIT_FAILURE, "waitpid");

  if (WIFSIGNALED(status))
    exit(128 + WTERMSIG(status));
  if (!WIFSTOPPED(status))
    exit(WEXITSTATUS(status));
}