├── systemd
    ├── isolate.slice
    └── isolate.service.in
├── debian
    ├── build
    │   ├── do-test
    │   ├── container-test
    │   │   ├── jammy
    │   │   ├── noble
    │   │   ├── bookworm
    │   │   └── trixie
    │   ├── container-build
    │   │   ├── jammy
    │   │   ├── noble
    │   │   ├── trixie
    │   │   └── bookworm
    │   ├── do-build
    │   └── run
    ├── isolate.lintian-overrides.trixie
    ├── isolate.lintian-overrides
    ├── copyright
    ├── isolate.postrm
    ├── isolate.postinst
    ├── rules
    ├── rules.trixie
    ├── control
    └── changelog
├── .gitignore
├── TODO
├── .travis.yml
├── isolate-cg-keeper.8.txt
├── LICENSE
├── default.cf.in
├── isolate-check-environment.8.txt
├── README.md
├── NEWS
├── isolate.h
├── isolate-cg-keeper.c
├── config.c
├── Makefile
├── cg.c
├── util.c
├── isolate-check-environment
├── rules.c
├── isolate.1.txt
└── isolate.c


/systemd/isolate.slice:
--------------------------------------------------------------------------------
1 | [Unit]
2 | Description=Slice for Isolate's sandboxes
3 | 


--------------------------------------------------------------------------------
/debian/build/do-test:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e  # abort on the first failing command
3 | apt install -y ./isolate_*.deb  # install every file matching isolate_*.deb in the current directory
4 | 


--------------------------------------------------------------------------------
/debian/isolate.lintian-overrides.trixie:
--------------------------------------------------------------------------------
1 | elevated-privileges
2 | unknown-section
3 | 


--------------------------------------------------------------------------------
/debian/build/container-test/jammy:
--------------------------------------------------------------------------------
1 | FROM ubuntu:jammy
2 | 
3 | RUN apt update && apt upgrade -y
4 | 


--------------------------------------------------------------------------------
/debian/build/container-test/noble:
--------------------------------------------------------------------------------
1 | FROM ubuntu:noble
2 | 
3 | RUN apt update && apt upgrade -y
4 | 


--------------------------------------------------------------------------------
/debian/build/container-test/bookworm:
--------------------------------------------------------------------------------
1 | FROM debian:bookworm
2 | 
3 | RUN apt update && apt upgrade -y
4 | 


--------------------------------------------------------------------------------
/debian/build/container-test/trixie:
--------------------------------------------------------------------------------
1 | FROM debian:trixie
2 | 
3 | RUN apt update && apt upgrade -y
4 | 


--------------------------------------------------------------------------------
/debian/isolate.lintian-overrides:
--------------------------------------------------------------------------------
1 | elevated-privileges
2 | unknown-section
3 | package-supports-alternative-init-but-no-init.d-script
4 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | debian/build/build-tmp
 2 | default.cf
 3 | docbook-xsl.css
 4 | isolate
 5 | *.[18]
 6 | *.[18].html
 7 | isolate-cg-keeper
 8 | systemd/isolate.service
 9 | *.o
10 | 


--------------------------------------------------------------------------------
/debian/build/container-build/jammy:
--------------------------------------------------------------------------------
1 | FROM ubuntu:jammy
2 | 
3 | RUN apt update && apt upgrade -y
4 | RUN apt install -y --no-install-recommends build-essential debhelper pkg-config libcap-dev libsystemd-dev asciidoc xmlto lintian
5 | 


--------------------------------------------------------------------------------
/debian/build/container-build/noble:
--------------------------------------------------------------------------------
1 | FROM ubuntu:noble
2 | 
3 | RUN apt update && apt upgrade -y
4 | RUN apt install -y --no-install-recommends build-essential debhelper pkg-config libcap-dev libsystemd-dev asciidoc xmlto lintian
5 | 


--------------------------------------------------------------------------------
/debian/build/container-build/trixie:
--------------------------------------------------------------------------------
1 | FROM debian:trixie
2 | 
3 | RUN apt update && apt upgrade -y
4 | RUN apt install -y --no-install-recommends build-essential debhelper pkg-config libcap-dev libsystemd-dev asciidoc xmlto lintian
5 | 


--------------------------------------------------------------------------------
/debian/build/container-build/bookworm:
--------------------------------------------------------------------------------
1 | FROM debian:bookworm
2 | 
3 | RUN apt update && apt upgrade -y
4 | RUN apt install -y --no-install-recommends build-essential debhelper pkg-config libcap-dev libsystemd-dev asciidoc xmlto lintian
5 | 


--------------------------------------------------------------------------------
/TODO:
--------------------------------------------------------------------------------
1 | - Make --inherit-fds accept a list of fd's to inherit
2 | 
3 | - use /etc/subuid for the UID range
4 | - but still allow to configure maximum number of sandboxes
5 |   less than the size of the range, so that CPU / node
6 |   restrictions cannot be bypassed
7 | 


--------------------------------------------------------------------------------
/systemd/isolate.service.in:
--------------------------------------------------------------------------------
 1 | [Unit]
 2 | Description=A trivial daemon to keep Isolate's control group hierarchy
 3 | 
 4 | [Service]
 5 | Type=notify
 6 | ExecStart=@SBINDIR@/isolate-cg-keeper
 7 | Slice=isolate.slice
 8 | Delegate=true
 9 | 
10 | [Install]
11 | WantedBy=multi-user.target
12 | 


--------------------------------------------------------------------------------
/debian/copyright:
--------------------------------------------------------------------------------
 1 | Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
 2 | Source: https://github.com/ioi/isolate
 3 | Upstream-Name: Isolate
 4 | Upstream-Contact: Martin Mareš <mj@ucw.cz>
 5 | 
 6 | Files:
 7 |  *
 8 | Copyright: 2012-2024 Martin Mareš and Bernard Blackham
 9 | License: GPL-2+
10 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: c
 2 | 
 3 | compiler: gcc
 4 | 
 5 | addons:
 6 |   apt:
 7 |     packages:
 8 |       - asciidoc
 9 |       - libcap-dev
10 |       - libxml2-utils
11 |       - xsltproc
12 |       - docbook-xml
13 |       - docbook-xsl
14 | 
15 | script:
16 |   - make DESTDIR=/tmp/isolate
17 |   - make DESTDIR=/tmp/isolate install
18 | 


--------------------------------------------------------------------------------
/debian/isolate.postrm:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | set -e  # abort on the first failing command
 3 | 
 4 | case "$1" in  # $1 is the action passed to this maintainer script
 5 | remove)  # undo what isolate.postinst set up: the "isolate" group and the stat override
 6 | 	if getent group isolate >/dev/null ; then  # only if the group exists
 7 | 		echo "Removing group isolate"
 8 | 		delgroup --quiet --only-if-empty isolate
 9 | 	fi
10 | 	if dpkg-statoverride --list /usr/bin/isolate >/dev/null 2>&1 ; then  # only if an override is registered
11 | 		dpkg-statoverride --remove /usr/bin/isolate
12 | 	fi
13 | ;;
14 | esac
15 | 
16 | #DEBHELPER#
17 | 
18 | exit 0
19 | 


--------------------------------------------------------------------------------
/debian/isolate.postinst:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | set -e  # abort on the first failing command
 3 | 
 4 | case "$1" in  # $1 is the action passed to this maintainer script
 5 | configure)  # create the "isolate" group and restrict /usr/bin/isolate to it
 6 | 	if ! getent group isolate >/dev/null ; then  # only if the group does not exist yet
 7 | 		echo "Adding new group isolate"
 8 | 		addgroup --quiet --system isolate
 9 | 	fi
10 | 	dpkg-statoverride --list /usr/bin/isolate >/dev/null 2>&1 ||  # keep an already registered override
11 | 		dpkg-statoverride --update --add root isolate 4754 /usr/bin/isolate  # owner root, group isolate, mode 4754
12 | ;;
13 | esac
14 | 
15 | #DEBHELPER#
16 | 
17 | exit 0
18 | 


--------------------------------------------------------------------------------
/debian/rules:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/make -f
 2 | 
 3 | # export DH_VERBOSE = 1
 4 | 
 5 | %:
 6 | 	dh $@
 7 | 
 8 | override_dh_auto_build:
 9 | 	make all PREFIX=/usr VARPREFIX=/var CONFIGDIR=/etc CFLAGS_EXTRA=-g
10 | 
11 | override_dh_auto_install:
12 | 	make install install-doc PREFIX=/usr VARPREFIX=/var CONFIGDIR=/etc LIBDIR=/lib DESTDIR=debian/isolate
13 | 
14 | override_dh_fixperms:
15 | 	dh_fixperms --exclude usr/bin/isolate
16 | 
17 | override_dh_installsystemd:
18 | 	dh_installsystemd isolate.service
19 | 


--------------------------------------------------------------------------------
/debian/rules.trixie:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/make -f
 2 | 
 3 | # export DH_VERBOSE = 1
 4 | 
 5 | %:
 6 | 	dh $@
 7 | 
 8 | override_dh_auto_build:
 9 | 	make all PREFIX=/usr VARPREFIX=/var CONFIGDIR=/etc CFLAGS_EXTRA=-g
10 | 
11 | override_dh_auto_install:
12 | 	make install install-doc PREFIX=/usr VARPREFIX=/var CONFIGDIR=/etc LIBDIR=/usr/lib DESTDIR=debian/isolate
13 | 
14 | override_dh_fixperms:
15 | 	dh_fixperms --exclude usr/bin/isolate
16 | 
17 | override_dh_installsystemd:
18 | 	dh_installsystemd isolate.service
19 | 


--------------------------------------------------------------------------------
/debian/build/do-build:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -e  # abort on the first failing command
 3 | cd isolate
 4 | . /etc/os-release  # provides $ID and $VERSION_CODENAME used below
 5 | for fixup in $(find debian -name "*.$VERSION_CODENAME") ; do  # files like debian/rules.trixie
 6 | 	echo "Applying fixup: $fixup"
 7 | 	mv "$fixup" "${fixup%.$VERSION_CODENAME}"  # drop the ".<codename>" suffix, replacing the generic file
 8 | done
 9 | if [ "$ID" = debian ] ; then
10 | 	# We currently run lintian only on Debian
11 | 	dpkg-buildpackage -b -uc -tc --check-command=lintian --check-option=-i --check-option=--fail-on=warning,error --check-option=--allow-root
12 | else
13 | 	dpkg-buildpackage -b -uc -tc
14 | fi
15 | 


--------------------------------------------------------------------------------
/isolate-cg-keeper.8.txt:
--------------------------------------------------------------------------------
 1 | ISOLATE-CG-KEEPER(8)
 2 | ====================
 3 | 
 4 | NAME
 5 | ----
 6 | isolate-cg-keeper - A helper daemon for keeping cgroups alive
 7 | 
 8 | SYNOPSIS
 9 | --------
10 | *isolate-cg-keeper*
11 | 
12 | DESCRIPTION
13 | -----------
14 | To use *isolate*(1) on a system with *systemd*(1), it is necessary to have a subtree of the
15 | control group tree delegated to Isolate.
16 | 
17 | This is accomplished by running a service (`isolate.service`) that keeps alive a slice (`isolate.slice`)
18 | using the *isolate-cg-keeper* daemon.
19 | 
20 | SEE ALSO
21 | --------
22 | *isolate*(1)
23 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Isolate is free software: you can redistribute it and/or modify
 2 | it under the terms of the GNU General Public License as published by
 3 | the Free Software Foundation, either version 2 of the License, or
 4 | (at your option) any later version.
 5 | 
 6 | This program is distributed in the hope that it will be useful,
 7 | but WITHOUT ANY WARRANTY; without even the implied warranty of
 8 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 9 | GNU General Public License for more details.
10 | 
11 | If you have less than 10 copies of the GPL on your system :-),
12 | you can find it at http://www.gnu.org/licenses/.
13 | 


--------------------------------------------------------------------------------
/debian/control:
--------------------------------------------------------------------------------
 1 | Source: isolate
 2 | Section: ucw
 3 | Priority: optional
 4 | Maintainer: Martin Mareš <mj@ucw.cz>
 5 | Standards-Version: 4.6.0.1
 6 | Build-Depends: debhelper (>= 13.0), debhelper-compat (= 13),
 7 | 	pkg-config, libcap-dev, libsystemd-dev, asciidoc
 8 | 
 9 | Package: isolate
10 | Architecture: any
11 | Description: Sandbox for programming contests
12 |  Isolate is a sandbox built to safely run untrusted executables, like
13 |  programs submitted by competitors in a programming contest. Isolate
14 |  gives them a limited-access environment, preventing them from affecting
15 |  the host system. It takes advantage of features specific to the Linux
16 |  kernel, like namespaces and control groups.
17 | Depends: adduser, ${shlibs:Depends}
18 | 


--------------------------------------------------------------------------------
/default.cf.in:
--------------------------------------------------------------------------------
 1 | # This is a configuration file for Isolate
 2 | 
 3 | # All sandboxes are created under this directory.
 4 | # To avoid symlink attacks, this directory and all its ancestors
 5 | # must be writeable only to root.
 6 | box_root = @BOXDIR@
 7 | 
 8 | # Directory where lock files are created.
 9 | lock_root = /run/isolate/locks
10 | 
11 | # Control group under which we place our subgroups
12 | # Either an explicit path to a subdirectory in cgroupfs, or "auto:file" to read
13 | # the path from "file", where it is put by isolate-cg-keeper.
14 | # cg_root = /sys/fs/cgroup/isolate.slice/isolate.service
15 | cg_root = auto:/run/isolate/cgroup
16 | 
17 | # Block of UIDs and GIDs reserved for sandboxes
18 | first_uid = 60000
19 | first_gid = 60000
20 | num_boxes = 1000
21 | 
22 | # Only root can create new sandboxes (default: 0=everybody can)
23 | #restricted_init = 1
24 | 
25 | # Per-box settings of the set of allowed CPUs and NUMA nodes
26 | # (see linux/Documentation/cgroups/cpusets.txt for precise syntax)
27 | 
28 | #box0.cpus = 4-7
29 | #box0.mems = 1
30 | 


--------------------------------------------------------------------------------
/debian/changelog:
--------------------------------------------------------------------------------
 1 | isolate (2.2.1) stable; urgency=medium
 2 | 
 3 |   * New upstream release.
 4 | 
 5 |  -- Martin Mares <mj@ucw.cz>  Wed, 01 Oct 2025 17:48:31 +0200
 6 | 
 7 | isolate (2.2) stable; urgency=medium
 8 | 
 9 |   * New upstream release.
10 | 
11 |   * Added a dependency on adduser.
12 | 
13 |  -- Martin Mares <mj@ucw.cz>  Mon, 01 Sep 2025 11:44:24 +0200
14 | 
15 | isolate (2.1.2) stable; urgency=medium
16 | 
17 |   * New upstream release.
18 | 
19 |  -- Martin Mares <mj@ucw.cz>  Thu, 14 Aug 2025 15:47:24 +0200
20 | 
21 | isolate (2.1) stable; urgency=medium
22 | 
23 |   * New upstream release.
24 | 
25 |   * Isolate is available only to users who are members of the
26 |     "isolate" group.
27 | 
28 |  -- Martin Mares <mj@ucw.cz>  Sun, 08 Jun 2025 02:31:41 +0200
29 | 
30 | isolate (2.0-1) stable; urgency=medium
31 | 
32 |   * isolate-check-environment does not spew error messages
33 |     if $TERM is not defined.
34 | 
35 |  -- Martin Mares <mj@ucw.cz>  Fri, 21 Jun 2024 20:39:17 +0200
36 | 
37 | isolate (2.0) stable; urgency=medium
38 | 
39 |   * Initial release.
40 | 
41 |  -- Martin Mares <mj@ucw.cz>  Tue, 11 Jun 2024 16:44:34 +0200
42 | 


--------------------------------------------------------------------------------
/isolate-check-environment.8.txt:
--------------------------------------------------------------------------------
 1 | ISOLATE-CHECK-ENVIRONMENT(8)
 2 | ============================
 3 | 
 4 | NAME
 5 | ----
 6 | isolate-check-environment - Check for common environment quirks
 7 | 
 8 | SYNOPSIS
 9 | --------
10 | *isolate-check-environment* [*-q*|*--quiet*] [*-e*|*--execute*]
11 | 
12 | DESCRIPTION
13 | -----------
14 | This script can be used to identify sources of run-time variability and other issues on
15 | Linux machines which may affect *isolate*(1).
16 | 
17 | If *--execute* is not specified, the recommended actions are written to stdout as an executable
18 | shell script. With *--execute*, the script will attempt to make changes to make the system
19 | behave more deterministically.
20 | 
21 | The changes performed by *--execute* persist only
22 | until a reboot. To persist across reboots, the standard output from this script
23 | should be added to `/etc/rc.local` or some other script that is run on each boot.
24 | Alternately, you could execute *isolate-check-environment --quiet --execute*
25 | from `rc.local`, but use this with caution as not all issues can be resolved in this way.
26 | 
27 | The exit status will be 0 if all checks pass, or 1 if some checks have failed.
28 | 
29 | Note that there are more strategies to reduce run-time variability further.
30 | See *isolate*(1) for details under REPRODUCIBILITY.
31 | 
32 | SEE ALSO
33 | --------
34 | *isolate*(1)
35 | 


--------------------------------------------------------------------------------
/debian/build/run:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -euo pipefail
 3 | 
 4 | BUILD_DIR=build-tmp
 5 | 
 6 | build ()
 7 | { # Build and test the packages for one $SUITE/$ARCH pair in podman containers; expects SUITE, ARCH and BDIR to be set.
 8 | 	local BUILD_IMAGE=isolate-build-$ARCH-$SUITE
 9 | 	local TEST_IMAGE=isolate-test-$ARCH-$SUITE
10 | 	local PLATFORM="--platform linux/$ARCH"  # used unquoted below so that it expands to two words
11 | 
12 | 	echo "### Building packages for $SUITE/$ARCH"
13 | 
14 | 	echo "# Updating build container"
15 | 	podman build $PLATFORM --file container-build/$SUITE --tag $BUILD_IMAGE
16 | 
17 | 	echo "# Updating test container"
18 | 	podman build $PLATFORM --file container-test/$SUITE --tag $TEST_IMAGE
19 | 
20 | 	echo "# Creating build directory"
21 | 	rm -rf $BDIR
22 | 	mkdir -p $BDIR
23 | 	cp do-build do-test $BDIR/
24 | 	( cd ../.. && git archive --prefix=isolate/ HEAD ) | ( cd $BDIR && tar x )  # export HEAD as $BDIR/isolate/
25 | 
26 | 	echo "# Building"
27 | 	podman run $PLATFORM -it --rm --volume ./$BDIR:/build --workdir /build $BUILD_IMAGE ./do-build
28 | 
29 | 	echo "# Testing"  # run in the test image, which lacks the build dependencies
30 | 	podman run $PLATFORM -it --rm --volume ./$BDIR:/build --workdir /build $TEST_IMAGE ./do-test
31 | }
32 | 
33 | publish ()
34 | { # Copy the built .deb files for one $SUITE/$ARCH pair to the repository host; expects SUITE, ARCH and BDIR to be set.
35 | 	echo "### Publishing packages for $SUITE/$ARCH"
36 | 	rsync $BDIR/*.deb jw:/projects/isolate/www/debian/dists/$SUITE-isolate/main/binary-$ARCH/
37 | }
38 | 
39 | try_dist ()
40 | { # Usage: try_dist SUITE ARCH -- run the command named by $STEP for one suite/architecture pair.
41 | 	local SUITE=$1
42 | 	local ARCH=$2
43 | 	local BDIR=$BUILD_DIR/$ARCH-$SUITE  # per-pair working directory
44 | 	$STEP  # STEP is a local of the caller (try_all_dists); the step sees SUITE, ARCH and BDIR the same way
45 | }
46 | 
47 | try_all_dists ()
48 | { # Usage: try_all_dists STEP -- run STEP (build or publish) for every supported suite/architecture pair.
49 | 	local STEP=$1
50 | 
51 | 	try_dist bookworm amd64
52 | 	try_dist trixie amd64
53 | 	try_dist noble amd64
54 | 	try_dist jammy amd64
55 | 
56 | 	# Cross-building for arm64 requires qemu-user-static installed
57 | 	try_dist bookworm arm64
58 | 	try_dist trixie arm64
59 | }
60 | 
61 | try_all_dists build  # with "set -e" in effect, a failed build stops the script before anything is published
62 | 
63 | echo -n "Press Enter to publish packages or Ctrl-C to abort ... "
64 | read ENTER  # the value is not used; this only waits for confirmation
65 | 
66 | try_all_dists publish
67 | 
68 | echo "### Updating package index"
69 | ssh -t jw 'cd /projects/isolate/debian && ./genpkg'
70 | 
71 | echo "### Cleaning up"
72 | rm -rf $BUILD_DIR
73 | 
74 | echo "### Done"
75 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | isolate
 2 | =======
 3 | 
 4 | Isolate is a sandbox built to safely run untrusted executables, like
 5 | programs submitted by competitors in a programming contest. Isolate
 6 | gives them a limited-access environment, preventing them from affecting
 7 | the host system. It takes advantage of features specific to the Linux
 8 | kernel, like namespaces and control groups.
 9 | 
10 | Isolate was developed by Martin Mareš (<mj@ucw.cz>) and Bernard Blackham
11 | (<bernard@blackham.com.au>) and is still maintained by the former author.
12 | Several other people contributed patches for features and bug fixes
13 | (see Git history for a list). Thanks!
14 | 
15 | Originally, Isolate was a part of the [Moe Contest Environment](http://www.ucw.cz/moe/),
16 | but it evolved to a separate project used by different
17 | contest systems, most prominently [CMS](https://github.com/cms-dev/cms).
18 | It now lives at [GitHub](https://github.com/ioi/isolate),
19 | where you can submit bug reports and feature requests.
20 | 
21 | If you are interested in more details, please read Martin's and Bernard's
22 | papers on [Isolate's design](https://mj.ucw.cz/papers/isolate.pdf) and
23 | [grading system security](https://mj.ucw.cz/papers/secgrad.pdf) published
24 | in the Olympiads in Informatics journal.
25 | Also, Isolate's [manual page](http://www.ucw.cz/isolate/isolate.1.html)
26 | is available online.
27 | 
28 | ## Installing Isolate
29 | 
30 | To compile Isolate, you need:
31 | 
32 |   - pkg-config
33 | 
34 |   - headers for the libcap library (usually available in a libcap-dev package)
35 | 
36 |   - headers for the libsystemd library (libsystemd-dev package) for compilation
37 |     of isolate-cg-keeper
38 | 
39 | You may need `a2x` (found in [AsciiDoc](https://asciidoc-py.github.io/a2x.1.html)) for building the manual.
40 | But if you only want the isolate binary, you can just run `make isolate`.
41 | 
42 | Recommended system setup is described in sections INSTALLATION and REPRODUCIBILITY
43 | of the manual page.
44 | 
45 | ## Debian packages
46 | 
47 | Isolate is also available as packages for stable Debian Linux and last two LTS
48 | releases of Ubuntu, all on the amd64 architecture. To use them, add the following
49 | to your `/etc/apt/sources.list`:
50 | 
51 |     deb [arch=amd64 signed-by=/etc/apt/keyrings/isolate.asc] http://www.ucw.cz/isolate/debian/ bookworm-isolate main
52 | 
53 | You also need to install the repository's public key:
54 | 
55 |     curl https://www.ucw.cz/isolate/debian/signing-key.asc >/etc/apt/keyrings/isolate.asc
56 | 
57 | Then invoke:
58 | 
59 |     apt update && apt install isolate
60 | 
61 | There are experimental packages for the arm64 architecture, too.
62 | 


--------------------------------------------------------------------------------
/NEWS:
--------------------------------------------------------------------------------
 1 | Version 2.2.1  [2025-09-29]
 2 | 
 3 |   *  Fixed the check for asymmetric cores in isolate-check-environment.
 4 | 
 5 | Version 2.2  [2025-09-01]
 6 | 
 7 |   *  Switched to a new kernel API for setting filesystem quotas,
 8 |      which works with more filesystems (e.g., tmpfs).
 9 | 
10 |   *  Wall-clock time is reported correctly even if the system
11 |      clock is re-set during program execution.
12 | 
13 | Version 2.1.2  [2025-08-14]
14 | 
15 |   *  Build date and commit are recorded in built binaries only
16 |      when building from a Git checkout.
17 | 
18 | Version 2.1.1  [2025-08-14]
19 | 
20 |   *  isolate-check-environment checks for asymmetric cores.
21 | 
22 |   *  By mistake, isolate was compiled without optimization.
23 |      Added -O2 to CFLAGS.
24 | 
25 |   *  Added packaging for Debian Trixie on both amd64 and arm64.
26 | 
27 | Version 2.1  [2025-06-08]
28 | 
29 |   •  There are official packages for Debian Bookworm (amd64, arm64)
30 |      and last two LTS releases of Ubuntu (amd64 only). Packaged Isolate
31 |      takes care of starting systemd services properly. It is available
32 |      only to users that are members of "isolate" group.
33 | 
34 |   •  Unit files for systemd are installed by default.
35 | 
36 |   •  Cgroup-based timing works when --run is used multiple times
37 |      on the same sandbox.
38 | 
39 |   •  Added manual pages for isolate-cg-keeper and isolate-check-environment.
40 |      The man page for isolate explains more about cgroups and containers.
41 | 
42 |   •  isolate-check-environment checks presence of simultaneous
43 |      multi-threading.
44 | 
45 |   •  All binaries are compiled with security hardening flags.
46 | 
47 |   •  Minor bug fixes.
48 | 
49 | Version 2.0  [2024-02-28]
50 | 
51 |   •  This version runs only on systems supporting CGroup v2, which are
52 |      basically all new Linux systems. If you need to stick with CGroup v1,
53 |      please use Isolate 1.10.1.
54 | 
55 |   •  If you are running systemd, Isolate now comes with isolate.service
56 |      that delegates a subtree of the cgroup hierarchy to Isolate.
57 |      On systems without systemd, you have to set up the delegation
58 |      yourself and set its root in Isolate's configuration file.
59 | 
60 |   •  The --cg-timing switch has been removed. In control group mode,
61 |      this mode of timing is always used.
62 | 
63 |   •  Added a simple protocol for locking sandboxes. When a sandbox is
64 |      initialized using "isolate --init", it is reserved for the calling
65 |      user until "isolate --cleanup" is used. It is also not allowed
66 |      to call "isolate --run" multiple times in parallel on the same box.
67 | 
68 |   •  "isolate --init" resets the sandbox if it already existed.
69 | 
70 |   •  Root can operate sandboxes on behalf of other users using
71 |      --as-uid and --as-gid options.
72 | 
73 |   •  Configuration can specify that only root is allowed to create new
74 |      sandboxes. Together with the previous feature, it allows for creation
75 |      of system-wide daemons allocating sandboxes to users. One such daemon
76 |      will probably appear in a future release of Isolate.
77 | 


--------------------------------------------------------------------------------
/isolate.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  *	Process Isolator
  3 |  *
  4 |  *	(c) 2012-2024 Martin Mares <mj@ucw.cz>
  5 |  *	(c) 2012-2014 Bernard Blackham <bernard@blackham.com.au>
  6 |  */
  7 | 
  8 | #include <stdarg.h>
  9 | #include <stdbool.h>
 10 | #include <stdint.h>
 11 | #include <sys/types.h>
 12 | #include <time.h>
 13 | 
 14 | #define NONRET __attribute__((noreturn))
 15 | #define UNUSED __attribute__((unused))
 16 | #define ARRAY_SIZE(a) (int)(sizeof(a)/sizeof(a[0]))
 17 | 
 18 | /* isolate.c */
 19 | 
 20 | void NONRET __attribute__((format(printf,1,2))) die(char *msg, ...);
 21 | void NONRET __attribute__((format(printf,1,2))) err(char *msg, ...);
 22 | void __attribute__((format(printf,1,2))) msg(char *msg, ...);
 23 | 
 24 | extern int pass_environ;
 25 | extern int verbose;
 26 | extern int block_quota;
 27 | extern int inode_quota;
 28 | extern int cg_enable;
 29 | extern int cg_memory_limit;
 30 | 
 31 | extern int box_id;
 32 | extern uid_t box_uid, orig_uid;
 33 | extern gid_t box_gid, orig_gid;
 34 | 
 35 | /* util.c */
 36 | 
 37 | void *xmalloc(size_t size);
 38 | char *xstrdup(char *str);
 39 | char * __attribute__((format(printf,1,2))) xsprintf(const char *fmt, ...);
 40 | 
 41 | void timespec_sub(const struct timespec *a, const struct timespec *b, struct timespec *result);
 42 | 
 43 | int dir_exists(char *path);
 44 | void rmtree(char *path);
 45 | void make_dir(char *path);
 46 | void make_dir_for(char *path);
 47 | void chowntree(char *path, uid_t uid, gid_t gid, bool keep_special_files);
 48 | void keep_fd(int fd);
 49 | void close_all_fds(void);
 50 | 
 51 | void meta_open(const char *name);
 52 | void meta_close(void);
 53 | void __attribute__((format(printf,1,2))) meta_printf(const char *fmt, ...);
 54 | 
 55 | /* rules.c */
 56 | 
 57 | int set_env_action(char *a0);
 58 | char **setup_environment(void);
 59 | 
 60 | void init_dir_rules(void);
 61 | int set_dir_action(char *arg);
 62 | void apply_dir_rules(int with_defaults);
 63 | 
 64 | void set_quota(void);
 65 | 
 66 | /* cg.c (without cg_enable, these functions do nothing) */
 67 | 
 68 | // Initialize CG machinery
 69 | void cg_init(void);
 70 | 
 71 | // Create a new CG for the box (during isolate --init)
 72 | void cg_create(void);
 73 | 
 74 | // Destroy the box CG (during isolate --cleanup)
 75 | void cg_remove(void);
 76 | 
 77 | // Prepare the box CG for use (during isolate --run)
 78 | void cg_setup(void);
 79 | 
 80 | // Move the current process to the box CG
 81 | void cg_enter(void);
 82 | 
 83 | // Obtain statistics on the box CG
 84 | int cg_get_run_time_ms(void);
 85 | void cg_stats(void);
 86 | 
 87 | /* config.c */
 88 | 
 89 | extern char *cf_box_root;
 90 | extern char *cf_lock_root;
 91 | extern char *cf_cg_root;
 92 | extern int cf_first_uid;
 93 | extern int cf_first_gid;
 94 | extern int cf_num_boxes;
 95 | extern int cf_restricted_init;
 96 | 
 97 | struct cf_per_box {		// Per-box configuration record, as returned by cf_per_box()
 98 |   struct cf_per_box *next;	// presumably links the records into a list -- confirm in config.c
 99 |   int box_id;			// ID of the box this record belongs to
100 |   char *cpus;			// presumably the boxN.cpus setting from the config file -- confirm in config.c
101 |   char *mems;			// presumably the boxN.mems setting from the config file -- confirm in config.c
102 | };
103 | 
104 | void cf_parse(void);
105 | struct cf_per_box *cf_per_box(int box_id);
106 | 
107 | static inline struct cf_per_box *
108 | cf_current_box(void)
109 | { // Shorthand for cf_per_box() applied to the global box_id.
110 |   return cf_per_box(box_id);
111 | }
112 | 


--------------------------------------------------------------------------------
/isolate-cg-keeper.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  *	A Trivial Helper Daemon for Keeping Control Groups in SystemD
  3 |  *
  4 |  *	(c) 2022--2024 Martin Mares <mj@ucw.cz>
  5 |  */
  6 | 
  7 | #include "isolate.h"
  8 | 
  9 | #include <fcntl.h>
 10 | #include <stdio.h>
 11 | #include <stdlib.h>
 12 | #include <string.h>
 13 | #include <unistd.h>
 14 | #include <sys/stat.h>
 15 | #include <systemd/sd-daemon.h>
 16 | 
 17 | #define CGROUP_FS "/sys/fs/cgroup"
 18 | 
 19 | void NONRET __attribute__((format(printf,1,2)))
 20 | die(char *msg, ...)
 21 | { // Print a printf-style message followed by a newline to stderr and exit with status 1.
 22 |   va_list ap;
 23 |   va_start(ap, msg);
 24 |   vfprintf(stderr, msg, ap);
 25 |   fputs("\n", stderr);
 26 |   exit(1);
 27 | }
 28 | 
 29 | static void __attribute__((format(printf,3,4)))
 30 | write_cg_attr(const char *cg_root, const char *name, const char *fmt, ...)
 31 | { // Write a printf-formatted value to the file <cg_root>/<name>; dies if it cannot be opened or written.
 32 |   char path[1024];
 33 |   snprintf(path, sizeof(path), "%s/%s", cg_root, name);
 34 | 
 35 |   // Format the value up front, so that the argument list can be released right away
 36 |   char value[1024];
 37 |   va_list ap;
 38 |   va_start(ap, fmt);
 39 |   vsnprintf(value, sizeof(value), fmt, ap);
 40 |   va_end(ap);
 41 |   int value_len = strlen(value);
 42 | 
 43 |   int fd = open(path, O_WRONLY);
 44 |   if (fd < 0)
 45 |     die("Cannot open %s: %m", path);
 46 | 
 47 |   // Anything but a complete write counts as a failure
 48 |   if (write(fd, value, value_len) != value_len)
 49 |     die("Cannot write to %s: %m", path);
 50 |   close(fd);
 51 | }
 52 | 
 53 | static void
 54 | check_cgroup_fs(void)
 55 | { // Die unless CGROUP_FS exists, has no "unified" entry and has a cgroup.subtree_control file.
 56 |   struct stat sb;
 57 | 
 58 |   if (stat(CGROUP_FS, &sb) != 0)
 59 |     die("Cannot find %s: %m", CGROUP_FS);
 60 | 
 61 |   if (stat(CGROUP_FS "/unified", &sb) == 0) // this entry is taken as the sign of the combined v1+v2 layout
 62 |     die("Combined cgroup v1+v2 mode is not supported");
 63 | 
 64 |   if (stat(CGROUP_FS "/cgroup.subtree_control", &sb) != 0) // this file is taken as the sign of cgroup v2
 65 |     die("Cgroup v2 not found");
 66 | }
 67 | 
 68 | static char *
 69 | get_my_cgroup(void)
 70 | { // Return CGROUP_FS followed by the path from the "0::" line of /proc/self/cgroup; dies if there is none.
 71 |   FILE *proc = fopen("/proc/self/cgroup", "r");
 72 |   if (!proc)
 73 |     die("Cannot open /proc/self/cgroup: %m");
 74 | 
 75 |   char *result = NULL;
 76 |   char *buf = NULL;
 77 |   size_t bufsize = 0;
 78 |   ssize_t n;
 79 | 
 80 |   while ((n = getline(&buf, &bufsize, proc)) >= 0)
 81 |     {
 82 |       if (n > 0 && buf[n-1] == '\n')	// strip the trailing newline
 83 | 	buf[--n] = 0;
 84 |       if (strncmp(buf, "0::", 3) == 0)	// everything after the "0::" prefix is used as the path
 85 | 	{
 86 | 	  result = xsprintf(CGROUP_FS "%s", buf + 3);
 87 | 	  break;
 88 | 	}
 89 |     }
 90 | 
 91 |   if (!result)
 92 |     die("Cannot find my own cgroup");
 93 | 
 94 |   free(buf);
 95 |   fclose(proc);
 96 |   return result;
 97 | }
 98 | 
 99 | static void
100 | write_auto_cgroup(char *file, char *cg)
101 | { // Store the string cg followed by a newline in file, which is opened with mode "w".
102 |   make_dir_for(file);
103 | 
104 |   FILE *out = fopen(file, "w");
105 |   if (out == NULL)
106 |     die("Cannot create %s: %m", file);
107 |   fprintf(out, "%s\n", cg);
108 |   fclose(out);
109 | }
110 | 
111 | static void
112 | setup_cg(void)
113 | { // Determine the cgroup root, create its "daemon" subgroup, put our PID there and enable controllers.
114 |   char *cg = cf_cg_root;
115 |   if (strlen(cf_cg_root) > 5 && !memcmp(cf_cg_root, "auto:", 5))
116 |     { // "auto:FILE": use our own cgroup as the root and record its path in FILE
117 |       check_cgroup_fs();
118 |       cg = get_my_cgroup();
119 |       write_auto_cgroup(cf_cg_root + 5, cg);
120 |     }
121 | 
122 |   struct stat st;
123 |   if (stat(cg, &st) < 0) // stat() returns -1 on failure; a comma expression "stat(...), 0" would never fire
124 |     die("Control group root %s does not exist: %m", cg);
125 | 
126 |   char subgroup[1024];
127 |   snprintf(subgroup, sizeof(subgroup), "%s/daemon", cg);
128 |   if (mkdir(subgroup, 0777) < 0)
129 |     die("Cannot create subgroup %s: %m", subgroup);
130 | 
131 |   write_cg_attr(cg, "daemon/cgroup.procs", "%d\n", (int) getpid()); // our own PID goes to the subgroup
132 |   write_cg_attr(cg, "cgroup.subtree_control", "+cpuset +memory\n"); // request the cpuset and memory controllers
133 | }
134 | 
135 | /* Parse the configuration, set up the cgroup, signal readiness through sd_notify() and then sleep forever. */
136 | int main(int argc UNUSED, char **argv UNUSED)
137 | {
138 |   cf_parse();
139 |   setup_cg();
140 |   sd_notify(0, "READY=1");
141 |   while (1)
142 |     pause();
143 | }
144 | 


--------------------------------------------------------------------------------
/config.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  *	Process Isolator -- Configuration File
  3 |  *
  4 |  *	(c) 2016--2023 Martin Mares <mj@ucw.cz>
  5 |  */
  6 | 
  7 | #include "isolate.h"
  8 | 
  9 | #include <errno.h>
 10 | #include <stdio.h>
 11 | #include <stdlib.h>
 12 | #include <string.h>
 13 | 
 14 | #define MAX_LINE_LEN 1024
 15 | 
 16 | char *cf_box_root;
 17 | char *cf_lock_root;
 18 | char *cf_cg_root;
 19 | int cf_first_uid;
 20 | int cf_first_gid;
 21 | int cf_num_boxes;
 22 | int cf_restricted_init;
 23 | 
 24 | static int line_number;
 25 | static struct cf_per_box *per_box_configs;
 26 | 
 27 | /* Die with the given message, prefixed by the number of the config file line being parsed. */
 28 | static void NONRET cf_err(char *msg)
 29 | {
 30 |   die("Error in config file, line %d: %s", line_number, msg);
 31 | }
 32 | 
 33 | /* String values are taken verbatim: return a copy made by xstrdup(). */
 34 | static char *cf_string(char *val)
 35 | {
 36 |   return xstrdup(val);
 37 | }
 38 | 
 39 | /* Parse a decimal number; empty input, trailing garbage and values not fitting in an int are config errors. */
 40 | static int cf_int(char *val)
 41 | {
 42 |   char *rest;
 43 |   errno = 0;
 44 |   long int num = strtol(val, &rest, 10);
 45 |   if (errno != 0 || rest == val || (rest && *rest != 0))
 46 |     cf_err("Invalid number");
 47 |   if (num != (long int)(int) num)
 48 |     cf_err("Number out of range");
 49 |   return num;
 50 | }
 51 | 
 52 | /* Store the value of a top-level item in the matching cf_* variable; unknown keys are config errors. */
 53 | static void cf_entry_toplevel(char *key, char *val)
 54 | {
 55 |   if (strcmp(key, "box_root") == 0)
 56 |     cf_box_root = cf_string(val);
 57 |   else if (strcmp(key, "lock_root") == 0)
 58 |     cf_lock_root = cf_string(val);
 59 |   else if (strcmp(key, "cg_root") == 0)
 60 |     cf_cg_root = cf_string(val);
 61 |   else if (strcmp(key, "first_uid") == 0)
 62 |     cf_first_uid = cf_int(val);
 63 |   else if (strcmp(key, "first_gid") == 0)
 64 |     cf_first_gid = cf_int(val);
 65 |   else if (strcmp(key, "num_boxes") == 0)
 66 |     cf_num_boxes = cf_int(val);
 67 |   else if (strcmp(key, "restricted_init") == 0)
 68 |     cf_restricted_init = cf_int(val);
 69 |   else
 70 |     cf_err("Unknown configuration item");
 71 | }
 72 | 
 73 | /* Handle a "boxN.subkey" item: N selects the per-box record, subkey the field receiving the value. */
 74 | static void cf_entry_compound(char *key, char *subkey, char *val)
 75 | {
 76 |   if (strncmp(key, "box", 3) != 0)
 77 |     cf_err("Unknown configuration section");
 78 |   // Whatever follows "box" in the section name has to be the box number
 79 |   struct cf_per_box *box = cf_per_box(cf_int(key + 3));
 80 | 
 81 |   if (strcmp(subkey, "cpus") == 0)
 82 |     box->cpus = cf_string(val);
 83 |   else if (strcmp(subkey, "mems") == 0)
 84 |     box->mems = cf_string(val);
 85 |   else
 86 |     cf_err("Unknown per-box configuration item");
 87 | }
 88 | 
 89 | /* Dispatch one key/value pair: keys containing a dot are compound ("section.item"), all others top-level. */
 90 | static void cf_entry(char *key, char *val)
 91 | {
 92 |   char *sep = strchr(key, '.');
 93 |   if (sep == NULL)
 94 |     {
 95 |       cf_entry_toplevel(key, val);
 96 |       return;
 97 |     }
 98 |   *sep = 0;		// Split the key in place at the first dot
 99 |   cf_entry_compound(key, sep + 1, val);
100 | }
101 | 
102 | static void
103 | cf_check(void)
104 | {
105 |   if (!cf_box_root ||
106 |       !cf_lock_root ||
107 |       !cf_cg_root ||
108 |       !cf_first_uid ||
109 |       !cf_first_gid ||
110 |       !cf_num_boxes)
111 |     cf_err("Configuration is not complete");
112 | }
113 | 
114 | /* Read CONFIG_FILE line by line, pass every "key = value" pair to cf_entry() and check completeness at the end. */
115 | void cf_parse(void)
116 | {
117 |   FILE *cfg = fopen(CONFIG_FILE, "r");
118 |   if (cfg == NULL)
119 |     die("Cannot open %s: %m", CONFIG_FILE);
120 | 
121 |   char line[MAX_LINE_LEN];
122 |   while (fgets(line, sizeof(line), cfg) != NULL)
123 |     {
124 |       line_number++;
125 |       char *eol = strchr(line, '\n');
126 |       if (eol == NULL)
127 |         cf_err("Line not terminated or too long");
128 |       *eol = 0;
129 | 
130 |       // Empty lines and lines starting with '#' are skipped
131 |       if (line[0] == 0 || line[0] == '#')
132 |         continue;
133 | 
134 |       // The key ends at the first blank or '='; blanks around the '=' are overwritten by zeros
135 |       char *p = line + strcspn(line, " \t=");
136 |       while (*p == ' ' || *p == '\t')
137 |         *p++ = 0;
138 |       if (*p != '=')
139 |         cf_err("Syntax error, expecting key=value");
140 |       *p++ = 0;
141 |       while (*p == ' ' || *p == '\t')
142 |         *p++ = 0;
143 | 
144 |       cf_entry(line, p);
145 |     }
146 | 
147 |   fclose(cfg);
148 |   cf_check();
149 | }
150 | 
151 | /* Find the per-box record for box_id; if there is none yet, prepend a zeroed one to the list and return it. */
152 | struct cf_per_box *cf_per_box(int box_id)
153 | {
154 |   struct cf_per_box *box = per_box_configs;
155 |   while (box && box->box_id != box_id)
156 |     box = box->next;
157 |   if (box)
158 |     return box;
159 | 
160 |   box = xmalloc(sizeof(*box));
161 |   memset(box, 0, sizeof(*box));
162 |   box->box_id = box_id;
163 |   box->next = per_box_configs;
164 |   per_box_configs = box;
165 |   return box;
166 | }
167 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
  1 | # Makefile for Isolate
  2 | # (c) 2015--2025 Martin Mares <mj@ucw.cz>
  3 | # (c) 2017 Bernard Blackham <bernard@blackham.com.au>
  4 | 
  5 | VERSION=2.2.1
  6 | YEAR=2025
  7 | # Everything built by "make all": programs, manual pages (also as HTML) and configuration files
  8 | PROGRAMS=isolate isolate-check-environment isolate-cg-keeper
  9 | MANPAGES=isolate.1 isolate-check-environment.8 isolate-cg-keeper.8
 10 | CONFIGS=default.cf systemd/isolate.slice systemd/isolate.service
 11 | 
 12 | all: $(PROGRAMS) $(MANPAGES) $(addsuffix .html, $(MANPAGES)) $(CONFIGS)
 13 | # Compiler and linker settings; CFLAGS_EXTRA is not set in this file, so it can be supplied from outside
 14 | CC=gcc
 15 | CFLAGS=-std=gnu99 -O2 -Wall -Wextra -Wno-parentheses -Wno-unused-result -Wno-missing-field-initializers -Wstrict-prototypes -Wmissing-prototypes $(CFLAGS_HARDEN) -D_GNU_SOURCE $(CFLAGS_EXTRA)
 16 | LDFLAGS=$(LDFLAGS_HARDEN)
 17 | LIBS=-lcap
 18 | 
 19 | # Inspiration: https://best.openssf.org/Compiler-Hardening-Guides/Compiler-Options-Hardening-Guide-for-C-and-C++.html
 20 | CFLAGS_HARDEN=-D_FORTIFY_SOURCE=3 -fstack-protector-strong -fstack-clash-protection -fPIE -pie
 21 | LDFLAGS_HARDEN=-Wl,-z,nodlopen -Wl,-z,noexecstack -Wl,-z,relro -Wl,-z,now
 22 | 
 23 | CFLAGS_BUILD=-DISOLATE_VERSION='"$(VERSION)"' -DISOLATE_YEAR='"$(YEAR)"'
 24 | 
 25 | # If we are building from a checked out repository, include build date and commit
 26 | BUILD_FROM_GIT := $(shell if [ -d .git ] ; then echo yes ; fi)
 27 | ifdef BUILD_FROM_GIT
 28 | BUILD_DATE := $(shell date '+%Y-%m-%d')
 29 | BUILD_COMMIT := $(shell if git rev-parse >/dev/null 2>/dev/null ; then git describe --always --tags ; else echo '<unknown>' ; fi)
 30 | CFLAGS_BUILD += -DBUILD_DATE='"$(BUILD_DATE)"' -DBUILD_COMMIT='"$(BUILD_COMMIT)"'
 31 | endif
 32 | # Installation directories, all derived from PREFIX and VARPREFIX
 33 | PREFIX = /usr/local
 34 | VARPREFIX = /var/local
 35 | CONFIGDIR = $(PREFIX)/etc
 36 | CONFIG = $(CONFIGDIR)/isolate
 37 | BINDIR = $(PREFIX)/bin
 38 | LIBDIR = $(PREFIX)/lib
 39 | SBINDIR = $(PREFIX)/sbin
 40 | DATADIR = $(PREFIX)/share
 41 | MANDIR = $(DATADIR)/man
 42 | MAN1DIR = $(MANDIR)/man1
 43 | MAN8DIR = $(MANDIR)/man8
 44 | BOXDIR = $(VARPREFIX)/lib/isolate
 45 | UNITDIR = $(LIBDIR)/systemd/system
 46 | # Compiler and linker flags for libsystemd as reported by pkg-config
 47 | SYSTEMD_CFLAGS := $(shell pkg-config libsystemd --cflags)
 48 | SYSTEMD_LIBS := $(shell pkg-config libsystemd --libs)
 49 | 
 50 | isolate: isolate.o util.o rules.o cg.o config.o
 51 | 	$(CC) $(LDFLAGS) -o $@ $^ $(LIBS)
 52 | 
 53 | isolate-cg-keeper: isolate-cg-keeper.o config.o util.o
 54 | 	$(CC) $(LDFLAGS) -o $@ $^ $(SYSTEMD_LIBS)
 55 | 
 56 | %.o: %.c isolate.h
 57 | 	$(CC) $(CFLAGS) -c -o $@ $<
 58 | # Target-specific additions to CFLAGS: version info, path of the config file, libsystemd flags
 59 | isolate.o: CFLAGS += $(CFLAGS_BUILD)
 60 | config.o: CFLAGS += -DCONFIG_FILE='"$(CONFIG)"'
 61 | isolate-cg-keeper.o: CFLAGS += $(SYSTEMD_CFLAGS)
 62 | 
 63 | %.1: %.1.txt
 64 | 	a2x -f manpage $<
 65 | 
 66 | %.8: %.8.txt
 67 | 	a2x -f manpage $<
 68 | 
 69 | # The dependency on %.1 (and likewise on %.8 below) is there to serialize both
 70 | # calls of asciidoc, which does not name temporary files safely.
 71 | %.1.html: %.1.txt %.1
 72 | 	a2x -f xhtml -D . $<
 73 | 
 74 | %.8.html: %.8.txt %.8
 75 | 	a2x -f xhtml -D . $<
 76 | # Generate files from *.in templates by substituting @SBINDIR@ and @BOXDIR@
 77 | %: %.in
 78 | 	sed "s|@SBINDIR@|$(SBINDIR)|g; s|@BOXDIR@|$(BOXDIR)|g" <$< >$@
 79 | 
 80 | clean:
 81 | 	rm -f *.o
 82 | 	rm -f isolate isolate-cg-keeper
 83 | 	rm -f $(MANPAGES) $(addsuffix .html, $(MANPAGES))
 84 | 	rm -f docbook-xsl.css
 85 | 	rm -f default.cf
 86 | 	rm -f systemd/isolate.service
 87 | # Install programs (isolate itself with mode 4755, i.e., setuid), the default config and systemd units
 88 | install: $(PROGRAMS) $(CONFIGS)
 89 | 	install -d $(DESTDIR)$(BINDIR) $(DESTDIR)$(SBINDIR) $(DESTDIR)$(BOXDIR) $(DESTDIR)$(CONFIGDIR) $(DESTDIR)$(UNITDIR)
 90 | 	install isolate-check-environment $(DESTDIR)$(BINDIR)
 91 | 	install isolate-cg-keeper $(DESTDIR)$(SBINDIR)
 92 | 	install -m 4755 isolate $(DESTDIR)$(BINDIR)
 93 | 	install -m 644 default.cf $(DESTDIR)$(CONFIG)
 94 | 	install -m 644 systemd/isolate.slice systemd/isolate.service $(DESTDIR)$(UNITDIR)
 95 | # Install manual pages to sections 1 and 8
 96 | install-doc: $(MANPAGES)
 97 | 	install -d $(DESTDIR)$(MAN1DIR) $(DESTDIR)$(MAN8DIR)
 98 | 	install -m 644 isolate.1 $(DESTDIR)$(MAN1DIR)/
 99 | 	install -m 644 isolate-check-environment.8 isolate-cg-keeper.8 $(DESTDIR)$(MAN8DIR)/
100 | # Tag the current version, push the tag, build a tarball from HEAD and upload it with the HTML manuals
101 | release: $(addsuffix .html,$(MANPAGES))
102 | 	git tag v$(VERSION)
103 | 	git push --tags
104 | 	git archive --format=tar --prefix=isolate-$(VERSION)/ HEAD | gzip >isolate-$(VERSION).tar.gz
105 | 	rsync isolate-$(VERSION).tar.gz jw:/home/ftp/pub/mj/isolate/
106 | 	rsync $(addsuffix .html,$(MANPAGES)) jw:/projects/isolate/www/
107 | 	ssh jw 'cd web && bin/release-prog isolate $(VERSION)'
108 | 
109 | .PHONY: all clean install install-doc release
110 | 


--------------------------------------------------------------------------------
/cg.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  *	Process Isolator -- Control Groups
  3 |  *
  4 |  *	(c) 2012-2024 Martin Mares <mj@ucw.cz>
  5 |  *	(c) 2012-2014 Bernard Blackham <bernard@blackham.com.au>
  6 |  */
  7 | 
  8 | #include "isolate.h"
  9 | 
 10 | #include <assert.h>
 11 | #include <errno.h>
 12 | #include <fcntl.h>
 13 | #include <limits.h>
 14 | #include <stdio.h>
 15 | #include <stdlib.h>
 16 | #include <string.h>
 17 | #include <sys/stat.h>
 18 | #include <unistd.h>
 19 | 
 20 | static char cg_name[256];
 21 | 
 22 | #define CG_BUFSIZE 1024
 23 | 
 24 | /* Build the path of our control group, or of its attribute if attr is not NULL; asserts that it fits in buf. */
 25 | static void cg_makepath(char *buf, size_t len, const char *attr)
 26 | {
 27 |   int used = attr
 28 |     ? snprintf(buf, len, "%s/%s/%s", cf_cg_root, cg_name, attr)
 29 |     : snprintf(buf, len, "%s/%s", cf_cg_root, cg_name);
 30 | 
 31 |   // A negative result converts to a huge size_t, so errors trip the assertion as well
 32 |   assert((size_t) used < len);
 33 | }
 34 | 
 35 | /*
 36 |  *  Read an attribute of our control group to buf (up to CG_BUFSIZE bytes are read) and drop one trailing newline.
 37 |  *  Returns 1 on success. If attr starts with '?', failure to open or read returns 0 instead of dying.
 38 |  */
 39 | static int cg_read(const char *attr, char *buf)
 40 | {
 41 |   int optional = (attr[0] == '?');
 42 |   if (optional)
 43 |     attr++;
 44 | 
 45 |   char path[PATH_MAX];
 46 |   cg_makepath(path, sizeof(path), attr);
 47 | 
 48 |   int fd = open(path, O_RDONLY);
 49 |   if (fd < 0)
 50 |     {
 51 |       if (!optional)
 52 |         die("Cannot read %s: %m", path);
 53 |       return 0;
 54 |     }
 55 | 
 56 |   int got = read(fd, buf, CG_BUFSIZE);
 57 |   if (got < 0)
 58 |     {
 59 |       if (!optional)
 60 |         die("Cannot read %s: %m", path);
 61 |       close(fd);
 62 |       return 0;
 63 |     }
 64 | 
 65 |   // Values which (nearly) fill the buffer are fatal even for optional attributes
 66 |   if (got >= CG_BUFSIZE - 1)
 67 |     die("Attribute %s too long", path);
 68 |   // Drop one trailing newline and terminate the string
 69 |   if (got > 0 && buf[got-1] == '\n')
 70 |     got--;
 71 |   buf[got] = 0;
 72 | 
 73 |   if (verbose > 1)
 74 |     msg("CG: Read %s = <%s>\n", attr, buf);
 75 | 
 76 |   close(fd);
 77 |   return 1;
 78 | }
 79 | 
 80 | /*
 81 |  *  Format a value printf-style and write it to an attribute of our control group.
 82 |  *  If attr starts with '?', failure to open the attribute or to write to it is
 83 |  *  silently ignored. Too long values and short writes are always fatal.
 84 |  */
 85 | static void __attribute__((format(printf,2,3)))
 86 | cg_write(const char *attr, const char *fmt, ...)
 87 | {
 88 |   int optional = (attr[0] == '?');
 89 |   if (optional)
 90 |     attr++;
 91 | 
 92 |   // Format the value first; the argument list is not needed afterwards
 93 |   char buf[CG_BUFSIZE];
 94 |   va_list args;
 95 |   va_start(args, fmt);
 96 |   int n = vsnprintf(buf, sizeof(buf), fmt, args);
 97 |   va_end(args);
 98 | 
 99 |   if (n >= CG_BUFSIZE)
100 |     die("cg_write: Value for attribute %s is too long", attr);
101 | 
102 |   if (verbose > 1)
103 |     msg("CG: Write %s = %s", attr, buf);
104 | 
105 |   char path[PATH_MAX];
106 |   cg_makepath(path, sizeof(path), attr);
107 | 
108 |   // An optional attribute which cannot be opened is skipped
109 |   int fd = open(path, O_WRONLY | O_TRUNC);
110 |   if (fd < 0)
111 |     {
112 |       if (!optional)
113 |         die("Cannot write %s: %m", path);
114 |       return;
115 |     }
116 | 
117 |   // A failed write to an optional attribute is ignored, too
118 |   int written = write(fd, buf, n);
119 |   if (written < 0)
120 |     {
121 |       if (!optional)
122 |         die("Cannot set %s to %s: %m", path, buf);
123 |     }
124 |   else if (written != n)
125 |     die("Short write to %s (%d out of %d bytes)", path, written, n);
126 | 
127 |   close(fd);
128 | }
129 | 
130 | /* Open an attribute of our control group for reading as a stdio stream; dies on failure. */
131 | static FILE *
132 | cg_fopen(const char *attr)
133 | {
134 |   char path[PATH_MAX];
135 |   cg_makepath(path, sizeof(path), attr);
136 |   FILE *stream = fopen(path, "r");
137 |   if (stream == NULL)
138 |     die("Cannot open %s: %m", path);
139 |   return stream;
140 | }
141 | 
142 | static void cg_fclose(FILE *f)
143 | {
144 |   if (ferror(f))
145 |     die("Read error on cgroup attributes: %m");
146 |   fclose(f);
147 | }
148 | 
149 | /* Read one "key value" line from f and copy its parts to key and val; returns 0 when fgets() yields nothing, else 1. */
150 | static int
151 | cg_fread_kv(FILE *f, char *key, char *val)
152 | {
153 |   char line[CG_BUFSIZE];
154 |   if (fgets(line, sizeof(line), f) == NULL)
155 |     return 0;
156 |   char *nl = strchr(line, '\n');
157 |   if (nl == NULL)
158 |     die("Non-terminated or too long line in cgroup key-value file");
159 |   *nl = 0;
160 | 
161 |   // The key is everything up to the first space, the value is the rest of the line
162 |   char *sep = strchr(line, ' ');
163 |   if (sep == NULL)
164 |     die("Missing space in cgroup key-value file");
165 |   *sep++ = 0;
166 |   strcpy(key, line);
167 |   strcpy(val, sep);
168 |   return 1;
169 | }
170 | 
171 | /* Resolve the control group root (an "auto:FILE" setting is replaced by the first line of FILE) and name our group. */
172 | void cg_init(void)
173 | {
174 |   if (!cg_enable)
175 |     return;
176 | 
177 |   if (strlen(cf_cg_root) > 5 && memcmp(cf_cg_root, "auto:", 5) == 0)
178 |     {
179 |       // The real root is stored in the first line of the file named after "auto:"
180 |       char *filename = cf_cg_root + 5;
181 |       FILE *f = fopen(filename, "r");
182 |       if (f == NULL)
183 |         die("Cannot open %s: %m", filename);
184 | 
185 |       char *root = NULL;
186 |       size_t alloc;
187 |       if (getline(&root, &alloc, f) < 0)
188 |         die("Cannot read from %s: %m", filename);
189 |       fclose(f);
190 | 
191 |       char *nl = strchr(root, '\n');
192 |       if (nl != NULL)
193 |         *nl = 0;
194 |       cf_cg_root = root;
195 |     }
196 | 
197 |   if (!dir_exists(cf_cg_root))
198 |     die("Control group root %s does not exist", cf_cg_root);
199 | 
200 |   snprintf(cg_name, sizeof(cg_name), "box-%d", box_id);
201 | 
202 |   msg("Using control group %s under parent %s\n", cg_name, cf_cg_root);
203 | }
204 | 
205 | /* Create a fresh control group for the box; a left-over one is removed by rmdir() first. */
206 | void cg_create(void)
207 | {
208 |   if (!cg_enable)
209 |     return;
210 | 
211 |   char path[PATH_MAX];
212 |   cg_makepath(path, sizeof(path), NULL);
213 | 
214 |   // Anything except a clean ENOENT from stat() is handled as an existing group
215 |   struct stat st;
216 |   if (!(stat(path, &st) < 0 && errno == ENOENT))
217 |     {
218 |       msg("Control group %s already exists, trying to empty it.\n", path);
219 |       if (rmdir(path) < 0)
220 |         die("Failed to reset control group %s: %m", path);
221 |     }
222 |   if (mkdir(path, 0777) != 0)
223 |     die("Failed to create control group %s: %m", path);
224 | }
225 | 
226 | /* Move the calling process to the box's control group and apply the memory limit and per-box cpuset settings. */
227 | void cg_enter(void)
228 | {
229 |   if (!cg_enable)
230 |     return;
231 | 
232 |   msg("Entering control group %s\n", cg_name);
233 |   cg_write("cgroup.procs", "%d\n", (int) getpid());
234 |   if (cg_memory_limit != 0)
235 |     {
236 |       // memory.max gets the limit multiplied by 1024; memory.swap.max is optional ('?')
237 |       long long max_bytes = (long long) cg_memory_limit << 10;
238 |       cg_write("memory.max", "%lld\n", max_bytes);
239 |       cg_write("?memory.swap.max", "0\n");
240 |     }
241 | 
242 |   struct cf_per_box *box_cf = cf_current_box();
243 |   if (box_cf->cpus)
244 |     cg_write("cpuset.cpus", "%s", box_cf->cpus);
245 |   if (box_cf->mems)
246 |     cg_write("cpuset.mems", "%s", box_cf->mems);
247 | }
248 | 
249 | /* Return usage_usec from the group's cpu.stat divided by 1000; dies if the key is not present. */
250 | static int raw_get_run_time_ms(void)
251 | {
252 |   char key[CG_BUFSIZE], val[CG_BUFSIZE];
253 |   unsigned long long total_usec = 0;
254 |   bool seen = false;
255 | 
256 |   FILE *stat_file = cg_fopen("cpu.stat");
257 |   // The whole file is read; should the key be repeated, its last occurrence wins
258 |   while (cg_fread_kv(stat_file, key, val))
259 |     {
260 |       if (strcmp(key, "usage_usec") != 0)
261 |         continue;
262 |       total_usec = atoll(val);
263 |       seen = true;
264 |     }
265 |   cg_fclose(stat_file);
266 | 
267 |   if (!seen)
268 |     die("Missing usage_usec in cpu.stat");
269 | 
270 |   return total_usec / 1000;
271 | }
272 | 
273 | static int cg_time_offset;
274 | 
275 | /* Run time of the control group in ms, less the offset stored in cg_time_offset; 0 if control groups are disabled. */
276 | int cg_get_run_time_ms(void)
277 | {
278 |   if (cg_enable)
279 |     return raw_get_run_time_ms() - cg_time_offset;
280 |   else
281 |     return 0;
282 | }
283 | 
284 | /* Remember the run time already accumulated by the box's control group. */
285 | void cg_setup(void)
286 | {
287 |   if (!cg_enable)
288 |     return;
289 | 
290 |   /*
291 |    *  One box CG may serve several invocations of "isolate --run", yet
292 |    *  cpu.stat is cumulative and cannot be reset. We therefore record
293 |    *  its initial value here, to be subtracted from later readings.
294 |    */
295 |   cg_time_offset = raw_get_run_time_ms();
296 |   if (verbose > 1)
297 |     msg("CG: Time offset = %d", cg_time_offset);
298 | }
299 | 
300 | /* Write control group statistics (peak memory and the OOM kill flag) to the meta-file. */
301 | void cg_stats(void)
302 | {
303 |   if (!cg_enable)
304 |     return;
305 | 
306 |   char key[CG_BUFSIZE], val[CG_BUFSIZE];
307 | 
308 |   // memory.peak is optional; it is reported divided by 1024 and only if non-zero
309 |   unsigned long long peak = cg_read("?memory.peak", val) ? atoll(val) : 0;
310 |   if (peak != 0)
311 |     meta_printf("cg-mem:%lld\n", peak >> 10);
312 | 
313 |   // OOM kill detection: look for a non-zero oom_kill counter in memory.events
314 |   int oom_seen = 0;
315 |   FILE *events = cg_fopen("memory.events");
316 |   while (!oom_seen && cg_fread_kv(events, key, val))
317 |     {
318 |       if (strcmp(key, "oom_kill") == 0 && atoll(val) != 0)
319 |         {
320 |           meta_printf("cg-oom-killed:1\n");
321 |           oom_seen = 1;
322 |         }
323 |     }
324 |   cg_fclose(events);
325 | }
326 | 
327 | /* Remove the box's control group if its directory exists. */
328 | void cg_remove(void)
329 | {
330 |   if (!cg_enable)
331 |     return;
332 | 
333 |   char path[PATH_MAX];
334 |   cg_makepath(path, sizeof(path), NULL);
335 |   if (!dir_exists(path))
336 |     return;
337 | 
338 |   msg("Removing control group\n");
339 | 
340 |   // Writing to cgroup.kill is optional ('?'): a failure here is ignored
341 |   cg_write("?cgroup.kill", "1\n");
342 | 
343 |   if (rmdir(path) < 0)
344 |     die("Cannot remove control group %s: %m", path);
345 | }
346 | 


--------------------------------------------------------------------------------
/util.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  *	Process Isolator -- Utility Functions
  3 |  *
  4 |  *	(c) 2012-2023 Martin Mares <mj@ucw.cz>
  5 |  *	(c) 2012-2014 Bernard Blackham <bernard@blackham.com.au>
  6 |  */
  7 | 
  8 | #include "isolate.h"
  9 | 
 10 | #include <assert.h>
 11 | #include <dirent.h>
 12 | #include <errno.h>
 13 | #include <fcntl.h>
 14 | #include <stdarg.h>
 15 | #include <stdio.h>
 16 | #include <stdlib.h>
 17 | #include <string.h>
 18 | #include <sys/fsuid.h>
 19 | #include <sys/stat.h>
 20 | #include <time.h>
 21 | #include <unistd.h>
 22 | 
 23 | /* malloc() which dies instead of returning NULL. */
 24 | void *xmalloc(size_t size)
 25 | {
 26 |   void *mem = malloc(size);
 27 |   if (mem == NULL)
 28 |     die("Out of memory");
 29 |   return mem;
 30 | }
 31 | 
 32 | /* strdup() which dies instead of returning NULL. */
 33 | char *xstrdup(char *str)
 34 | {
 35 |   char *copy = strdup(str);
 36 |   if (copy == NULL)
 37 |     die("Out of memory");
 38 |   return copy;
 39 | }
 40 | 
 41 | /* Format to a string allocated by vasprintf() and return it; dies if vasprintf() fails. */
 42 | char *
 43 | xsprintf(const char *fmt, ...)
 44 | {
 45 |   char *str;
 46 |   va_list ap;
 47 |   va_start(ap, fmt);
 48 |   int len = vasprintf(&str, fmt, ap);
 49 |   va_end(ap);
 50 |   if (len < 0)
 51 |     die("Out of memory");
 52 |   return str;
 53 | }
 54 | 
 55 | /* Compute *result = *a - *b. */
 56 | void timespec_sub(const struct timespec *a, const struct timespec *b, struct timespec *result)
 57 | {
 58 |   long nsec = a->tv_nsec - b->tv_nsec;
 59 | 
 60 |   // A negative nanosecond difference is fixed by borrowing one second
 61 |   int borrow = (nsec < 0);
 62 |   if (borrow)
 63 |     nsec += 1000000000L;
 64 |   result->tv_sec = a->tv_sec - b->tv_sec - borrow;
 65 |   result->tv_nsec = nsec;
 66 | }
 67 | 
 68 | /* Returns 1 if path can be stat()ed and is a directory, otherwise 0. */
 69 | int dir_exists(char *path)
 70 | {
 71 |   struct stat info;
 72 |   return (stat(path, &info) == 0) ? !!S_ISDIR(info.st_mode) : 0;
 73 | }
 74 | 
 75 | /* Create directory path together with all missing parents; path is temporarily modified in place. */
 76 | void make_dir(char *path)
 77 | {
 78 |   char *cut = (*path == '/') ? path + 1 : path;
 79 | 
 80 |   do
 81 |     {
 82 |       // Terminate the path at the next slash for a while and create that prefix
 83 |       cut = strchr(cut, '/');
 84 |       if (cut)
 85 |         *cut = 0;
 86 | 
 87 |       if (mkdir(path, 0777) < 0 && errno != EEXIST)
 88 |         die("Cannot create directory %s: %m", path);
 89 |       if (cut)
 90 |         *cut++ = '/';
 91 |     }
 92 |   while (cut);
 93 | 
 94 |   // mkdir() above reports EEXIST even when the existing item is not
 95 |   // a directory, so we have to check what is really there.
 96 |   struct stat st;
 97 |   if (stat(path, &st) < 0)
 98 |     die("Cannot stat %s: %m", path);
 99 |   if (!S_ISDIR(st.st_mode))
100 |     die("Cannot create %s: already exists, but not a directory", path);
101 | }
102 | 
103 | void make_dir_for(char *path)	// Create the directory part of path (everything before the last slash), if any
104 | {
105 |   char *dir = xstrdup(path);
106 |   char *slash = strrchr(dir, '/');
107 |   if (slash != NULL)
108 |     {
109 |       *slash = 0;
110 |       make_dir(dir);
111 |     }
112 |   free(dir);
113 | }
114 | 
115 | /*
116 |  *  Once upon a time, we used nftw() for traversing directory trees.
117 |  *  It was simple, but unfortunately prone to symlink swapping attacks.
118 |  *  Using FTW_CHDIR would prevent the attacks, but it interacts badly with
119 |  *  FTW_DEPTH which we need when removing directory trees. See bug report at
120 |  *  https://sourceware.org/bugzilla/show_bug.cgi?id=28831.
121 |  *
122 |  *  We therefore switched to our implementation based on using openat(),
123 |  *  fstatat() and similar functions.
124 |  */
125 | 
126 | struct walk_context {
127 |     // Current item
128 |     int dir_fd;				// Directory the item lives in (AT_FDCWD for the root of the walk)
129 |     const char *name;			// Name of the item, relative to dir_fd
130 |     bool is_dir;			// Set before each callback
131 |     struct stat st;			// Filled by fstatat() without following symlinks (fstat() for the root)
132 | 
133 |     // Common for the whole walk
134 |     dev_t root_dev;			// Device of the root; items on another device abort the walk
135 |     void (*callback)(struct walk_context *ctx);	// Called for every item, for directories after their contents
136 | 
137 |     // Used by our callbacks
138 |     uid_t chown_uid;			// chowntree: new owner
139 |     gid_t chown_gid;			// chowntree: new group
140 |     bool keep_special_files;		// chowntree: chown items other than files and directories instead of unlinking them
141 | };
142 | 
143 | static void
144 | walktree_ctx(struct walk_context *ctx)	// Visit everything inside the directory open as ctx->dir_fd
145 | {
146 |   DIR *dir = fdopendir(ctx->dir_fd);
147 |   if (!dir)
148 |     die("fdopendir failed: %m");
149 |   // From now on, dir_fd is owned by the DIR stream and gets closed by closedir() below
150 |   struct dirent *de;
151 |   while (de = readdir(dir))
152 |     {
153 |       ctx->name = de->d_name;
154 | 
155 |       if (!strcmp(ctx->name, ".") || !strcmp(ctx->name, ".."))
156 | 	continue;
157 | 
158 |       if (fstatat(ctx->dir_fd, ctx->name, &ctx->st, AT_SYMLINK_NOFOLLOW) < 0)
159 | 	die("Cannot stat %s: %m", ctx->name);
160 |       // An item on a different device than the root is handled as a mountpoint, which is fatal
161 |       if (ctx->st.st_dev != ctx->root_dev)
162 | 	die("Unexpected mountpoint: %s", ctx->name);
163 | 
164 |       if (S_ISDIR(ctx->st.st_mode))
165 | 	{
166 | 	  struct walk_context subdir = *ctx;
167 | 	  subdir.dir_fd = openat(ctx->dir_fd, ctx->name, O_RDONLY | O_DIRECTORY | O_NOFOLLOW);	// Fails if a symlink has replaced the directory
168 | 	  if (subdir.dir_fd < 0)
169 | 	    die("Cannot open directory %s: %m", ctx->name);
170 | 	  walktree_ctx(&subdir);
171 | 	  ctx->is_dir = true;		// The directory itself is reported after its contents
172 | 	  ctx->callback(ctx);
173 | 	}
174 |       else
175 | 	{
176 | 	  ctx->is_dir = false;
177 | 	  ctx->callback(ctx);
178 | 	}
179 |     }
180 | 
181 |   closedir(dir);
182 | }
183 | 
184 | static void
185 | walktree(struct walk_context *ctx, const char *path, void (*callback)(struct walk_context *ctx))	// Walk the tree rooted at path
186 | {
187 |   ctx->callback = callback;
188 |   ctx->dir_fd = AT_FDCWD;
189 |   ctx->name = path;
190 |   // *ctx describes the root itself (for the final callback), the copy in top is used for its contents
191 |   struct walk_context top = *ctx;
192 |   top.dir_fd = open(path, O_RDONLY | O_DIRECTORY);
193 |   if (top.dir_fd < 0)
194 |     die("Cannot open directory %s: %m", path);
195 | 
196 |   if (fstat(top.dir_fd, &ctx->st) < 0)
197 |     die("Cannot stat %s: %m", path);
198 |   assert(S_ISDIR(ctx->st.st_mode));
199 |   top.root_dev = ctx->st.st_dev;	// Everything visited has to be on the same device as the root
200 | 
201 |   walktree_ctx(&top);
202 |   // Finally, the callback is called for the root directory itself
203 |   ctx->is_dir = true;
204 |   ctx->callback(ctx);
205 | }
206 | 
207 | /* Walk callback of rmtree(): remove the current item, using AT_REMOVEDIR for directories. */
208 | static void rmtree_helper(struct walk_context *ctx)
209 | {
210 |   int flags = ctx->is_dir ? AT_REMOVEDIR : 0;
211 | 
212 |   if (unlinkat(ctx->dir_fd, ctx->name, flags) >= 0)
213 |     return;
214 | 
215 |   // Only the wording of the error message depends on the type of the item
216 |   if (ctx->is_dir)
217 |     die("Cannot rmdir %s: %m", ctx->name);
218 |   else
219 |     die("Cannot unlink %s: %m", ctx->name);
220 | }
221 | 
222 | /* Remove the directory tree rooted at path, including path itself. */
223 | void rmtree(char *path)
224 | {
225 |   struct walk_context walk = { };
226 |   walktree(&walk, path, rmtree_helper);
227 | }
228 | 
229 | /* Walk callback of chowntree(): chown files and directories; other items are chowned or unlinked per keep_special_files. */
230 | static void chowntree_helper(struct walk_context *ctx)
231 | {
232 |   mode_t mode = ctx->st.st_mode;
233 |   bool plain = S_ISREG(mode) || S_ISDIR(mode);
234 |   if (!plain && !ctx->keep_special_files)
235 |     {
236 |       if (unlinkat(ctx->dir_fd, ctx->name, 0) < 0)
237 |         die("Cannot unlink special file %s: %m", ctx->name);
238 |       return;
239 |     }
240 |   if (fchownat(ctx->dir_fd, ctx->name, ctx->chown_uid, ctx->chown_gid, AT_SYMLINK_NOFOLLOW) < 0)
241 |     die("Cannot chown %s: %m", ctx->name);
242 | }
243 | 
244 | /* Walk the tree rooted at path (path itself included) with chowntree_helper(), passing it uid, gid and the flag. */
245 | void chowntree(char *path, uid_t uid, gid_t gid, bool keep_special_files)
246 | {
247 |   struct walk_context walk = { };
248 |   walk.chown_uid = uid;
249 |   walk.chown_gid = gid;
250 |   walk.keep_special_files = keep_special_files;
251 | 
252 |   walktree(&walk, path, chowntree_helper);
253 | }
254 | 
255 | static int fds_to_keep[4];
256 | static int num_kept_fds;
257 | 
258 | /* Register fd in the fixed-size table consulted by fd_is_kept(); overflow of the table trips an assertion. */
259 | void keep_fd(int fd)
260 | {
261 |   assert(num_kept_fds < ARRAY_SIZE(fds_to_keep));
262 |   fds_to_keep[num_kept_fds++] = fd;
263 | }
264 | 
265 | /* Has fd been registered by keep_fd()? */
266 | static bool fd_is_kept(int fd)
267 | {
268 |   int idx = num_kept_fds;
269 |   while (idx-- > 0 && fds_to_keep[idx] != fd)
270 |     ;
271 |   return idx >= 0;
272 | }
273 | 
274 | /* Close all file descriptors listed in /proc/self/fd except 0, 1, 2 and those registered by keep_fd(). */
275 | void close_all_fds(void)
276 | {
277 |   DIR *fd_dir = opendir("/proc/self/fd");
278 |   if (fd_dir == NULL)
279 |     die("Cannot open /proc/self/fd: %m");
280 | 
281 |   // The descriptor used for the scan itself must stay open until closedir()
282 |   int scan_fd = dirfd(fd_dir);
283 | 
284 |   struct dirent *entry;
285 |   while ((entry = readdir(fd_dir)) != NULL)
286 |     {
287 |       // Entries which are not plain numbers (like "." and "..") are skipped
288 |       char *rest;
289 |       long int fd = strtol(entry->d_name, &rest, 10);
290 |       if (*rest != 0)
291 |         continue;
292 |       bool is_std = (fd >= 0 && fd <= 2);
293 |       if (!is_std && fd != scan_fd && !fd_is_kept(fd))
294 |         close(fd);
295 |     }
296 |   closedir(fd_dir);
297 | }
298 | 
299 | /*** Meta-files ***/
300 | 
301 | static FILE *metafile;
302 | 
303 | /* Open the meta-file for writing; "-" stands for stdout. */
304 | void meta_open(const char *name)
305 | {
306 |   if (strcmp(name, "-") == 0)
307 |     {
308 |       metafile = stdout;
309 |       return;
310 |     }
311 |   if (setfsuid(getuid()) < 0)		// The file is opened with FS UID set to the real UID ...
312 |     die("Failed to switch FS UID: %m");
313 |   metafile = fopen(name, "w");
314 |   if (setfsuid(geteuid()) < 0)		// ... which is set to the effective UID again afterwards
315 |     die("Failed to switch FS UID back: %m");
316 |   if (metafile == NULL)
317 |     die("Failed to open metafile '%s'",name);
318 |   keep_fd(fileno(metafile));		// Registered with keep_fd(), so close_all_fds() leaves it open
319 | }
320 | 
321 | /* Close the meta-file unless there is none or it is stdout. */
322 | void meta_close(void)
323 | {
324 |   if (metafile != NULL && metafile != stdout)
325 |     fclose(metafile);
326 | }
327 | 
328 | /* printf() to the meta-file; does nothing when no meta-file is open. */
329 | void meta_printf(const char *fmt, ...)
330 | {
331 |   if (metafile != NULL)
332 |     {
333 |       va_list ap;
334 |       va_start(ap, fmt);
335 |       vfprintf(metafile, fmt, ap);
336 |       va_end(ap);
337 |     }
338 | }
339 | 


--------------------------------------------------------------------------------
/isolate-check-environment:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | #
  3 | # Identifies potential sources of issues when using isolate.
  4 | #
  5 | #     (c) 2017 Bernard Blackham <bernard@blackham.com.au>
  6 | #     (c) 2022-2025 Martin Mares <mj@ucw.cz>
  7 | #     (c) 2024 Stephan Gomer <me@sadfun.org>
  8 | #
  9 | 
 10 | usage() {    # Print the help text to stderr and exit with status 2
 11 |     cat >&2 <<EOT
 12 | Usage: $0 [-q|--quiet] [-e|--execute]
 13 | 
 14 | Use this script to identify sources of run-time variability and other issues on
 15 | Linux machines which may affect isolate.
 16 | 
 17 | See manual page for details.
 18 | EOT
 19 |     exit 2
 20 | }
 21 | 
 22 | # Parse options; invalid options and any remaining arguments end up in usage.
 23 | args=$(getopt -o "ehq" --long "execute,help,quiet" -- "$@") || usage
 24 | eval set -- "$args"
 25 | quiet=
 26 | execute=
 27 | while true ; do
 28 |     case "$1" in
 29 |         -q|--quiet)   quiet=1 ; shift ;;
 30 |         -e|--execute) execute=1 ; shift ;;
 31 |         -h|--help)    usage ;;
 32 |         --)           shift ; break ;;
 33 |         *)            usage ;;
 34 |     esac
 35 | done
 36 | if [ -n "$*" ] ; then usage ; fi
 37 | 
 38 | # Some helper boilerplate machinery.
 39 | exit_status=0
 40 | if [ -n "$TERM" -a "$TERM" != dumb ] ; then
 41 | 	red=$(tput setaf 1)
 42 | 	green=$(tput setaf 2)
 43 | 	yellow=$(tput setaf 3)
 44 | 	normal=$(tput sgr0)
 45 | else
 46 | 	red=
 47 | 	green=
 48 | 	yellow=
 49 | 	normal=
 50 | fi
 51 | 
 52 | # Succeed (status 0) exactly when the quiet option was given.
 53 | quiet() {
 54 |     test -n "$quiet"
 55 | }
 56 | 
 57 | # Unless quiet, print all arguments as a single warning message to stderr.
 58 | warn() {
 59 |     quiet || >&2 echo "${yellow}WARNING:${normal}" "$*"
 60 | }
 61 | 
 62 | # Warn with the first argument. The second argument is a remedial command:
 63 | # it is run by "sh -c" if --execute is given, otherwise just printed to stdout.
 64 | action() {
 65 |     quiet || warn "$1"
 66 |     if [ -z "$execute" ] ; then
 67 |         quiet || echo $2
 68 |     else
 69 |         quiet || echo "+ $2"
 70 |         sh -c "$2"
 71 |     fi
 72 | }
 73 | 
 74 | print_start_check() {    # Unless quiet, announce a check on stderr and mark its status as pending
 75 |     quiet && return
 76 |     print_check_status=1
 77 |     >&2 echo -n "Checking for $@ ... "
 78 | }
 79 | 
 80 | print_fail() {    # Make the script fail and finish a pending status line with FAIL
 81 |     exit_status=1
 82 |     quiet && return
 83 |     if [ -n "$print_check_status" ] ; then echo "${red}FAIL${normal}" >&2 ; fi
 84 |     print_check_status=
 85 | }
 86 | 
 87 | print_dubious() {    # Make the script fail and finish a pending status line with CAUTION
 88 |     exit_status=1
 89 |     quiet && return
 90 |     if [ -n "$print_check_status" ] ; then echo "${yellow}CAUTION${normal}" >&2 ; fi
 91 |     print_check_status=
 92 | }
 93 | 
 94 | print_skipped() {    # Finish a pending status line with SKIPPED; the exit status is not touched
 95 |     quiet && return
 96 |     if [ -n "$print_check_status" ] ; then echo "SKIPPED (not detected)" >&2 ; fi
 97 |     print_check_status=
 98 | }
 99 | 
100 | print_finish() {    # Finish a status line with PASS if no other result has been printed for it
101 |     quiet && return
102 |     if [ -n "$print_check_status" ] ; then echo "${green}PASS${normal}" >&2 ; fi
103 |     print_check_status=
104 | }
105 | 
106 | # Check that the given attribute file exists in the cgroup root.
107 | cgroup_check() {
108 |     local attr=$1
109 |     print_start_check "cgroup support for $attr"
110 |     if [ ! -f "$cg_root/$attr" ] ; then
111 |         print_dubious
112 |         warn "the $attr is not present. isolate --cg cannot be used."
113 |     fi
114 |     print_finish
115 | }
116 | 
117 | # Ask isolate for the cgroup root and check the attribute files listed below.
118 | if cg_root=$(isolate --print-cg-root 2>/dev/null) ; then
119 |     quiet || echo "Using cgroup root: $cg_root"
120 |     cgroup_check cpuset.cpus
121 |     cgroup_check cpuset.mems
122 |     cgroup_check cpu.stat
123 |     cgroup_check cgroup.procs
124 |     cgroup_check memory.events
125 |     cgroup_check memory.max
126 | else
127 |     warn "cgroup root not found. isolate --cg cannot be used."
128 |     exit_status=1
129 | fi
130 | 
131 | # Check that swap is either disabled or accounted for.
132 | swap_check() {
133 |     print_start_check "swap"
134 |     local active_swaps
135 |     active_swaps=$(swapon --noheadings)
136 |     # If swapon lists nothing, there is nothing to worry about.
137 |     if [ -n "$active_swaps" ] ; then
138 |         # Swap is enabled, so the cgroup root had better offer memory.swap.current.
139 |         if [ -f "$cg_root/memory.swap.current" ] ; then
140 |             print_dubious
141 |             warn "swap is enabled, and although accounted for, may still give run-time variability under memory pressure."
142 |         else
143 |             print_fail
144 |             action \
145 |                 "swap is enabled, but swap accounting is not. isolate will not be able to enforce memory limits." \
146 |                 "swapoff -a"
147 |         fi
148 |     fi
149 |     print_finish
150 | }
151 | swap_check
152 | 
153 | # Check that SMT is disabled.
154 | smt_check() {
155 |     print_start_check "simultaneous multithreading"
156 |     local val
157 |     if val="$(cat /sys/devices/system/cpu/smt/active 2>/dev/null)" ; then
158 |         if [ "$val" -ne 0 ] ; then
159 |             print_fail
160 | 
161 |             val="$(cat /sys/devices/system/cpu/smt/control)"
162 |             if [ "$val" != "notimplemented" ] ; then
163 |                 action \
164 |                     "simultaneous multithreading is enabled." \
165 |                     "echo off > /sys/devices/system/cpu/smt/control"
166 |             else
167 |                 warn "SMT is enabled, but runtime SMT toggling is not supported. Add 'nosmt=1' to the kernel command line."
168 |             fi
169 |         fi
170 |     else
171 |         print_skipped
172 |     fi
173 |     print_finish
174 | }
175 | smt_check
176 | 
177 | # Check that CPU frequency scaling is disabled (every governor must be 'performance').
178 | cpufreq_check() {
179 |     print_start_check "CPU frequency scaling"
180 |     local anycpus policy cpufreq_file
181 |     anycpus=
182 |     # Walk all governors; find errors are silenced so a missing cpufreq tree just ends up as "skipped".
183 |     for cpufreq_file in $(find /sys/devices/system/cpu/cpufreq/ -name scaling_governor 2>/dev/null) ; do
184 |         if policy=$(cat "$cpufreq_file" 2>/dev/null) ; then
185 |             if [ "$policy" != "performance" ] ; then
186 |                 print_fail
187 |                 action \
188 |                     "cpufreq governor set to '$policy', but 'performance' would be better" \
189 |                     "echo performance > $cpufreq_file"
190 |             fi
191 |         fi
192 |         anycpus=1
193 |     done
194 |     [ -z "$anycpus" ] && print_skipped
195 |     print_finish
196 | }
197 | cpufreq_check
198 | 
199 | # Check that Intel frequency boost is disabled
200 | intel_boost_check() {
201 |     print_start_check "Intel frequency boost"
202 |     local val
203 |     if val=$(cat /sys/devices/system/cpu/intel_pstate/no_turbo 2>/dev/null) ; then
204 |         if [ "$val" -ne 1 ] ; then
205 |             print_fail
206 |             action \
207 |                 "frequency boosting is enabled." \
208 |                 "echo 1 > /sys/devices/system/cpu/intel_pstate/no_turbo"
209 |         fi
210 |     else
211 |         print_skipped
212 |     fi
213 |     print_finish
214 | }
215 | intel_boost_check
216 | 
217 | # Check that general frequency boost is disabled
218 | general_boost_check() {
219 |     print_start_check "general frequency boost"
220 |     local val
221 |     if val=$(cat /sys/devices/system/cpu/cpufreq/boost 2>/dev/null) ; then
222 |         if [ "$val" -ne 0 ] ; then
223 |             print_fail
224 |             action \
225 |                 "frequency boosting is enabled." \
226 |                 "echo 0 > /sys/devices/system/cpu/cpufreq/boost"
227 |         fi
228 |     else
229 |         print_skipped
230 |     fi
231 |     print_finish
232 | }
233 | general_boost_check
234 | 
235 | # Check that address space layout randomisation is disabled.
236 | aslr_check() {
237 |     print_start_check "kernel address space randomisation"
238 |     local val
239 |     if val=$(cat /proc/sys/kernel/randomize_va_space 2>/dev/null) ; then
240 |         if [ "$val" -ne 0 ] ; then
241 |             print_fail
242 |             action \
243 |                 "address space randomisation is enabled." \
244 |                 "echo 0 > /proc/sys/kernel/randomize_va_space"
245 |         fi
246 |     else
247 |         print_skipped
248 |     fi
249 |     print_finish
250 | }
251 | aslr_check
252 | 
253 | # Check that transparent huge-pages are disabled, as this leads to
254 | # non-determinism depending on whether the kernel can allocate 2 MiB pages or
255 | # not.
256 | thp_check() {
257 |     print_start_check "transparent hugepage support"
258 |     local val
259 |     if val=$(cat /sys/kernel/mm/transparent_hugepage/enabled 2>/dev/null) ; then
260 |         case $val in
261 |             *'[never]'*) ;;
262 |             *) print_fail
263 |                action \
264 |                     "transparent hugepages are enabled." \
265 |                     "echo never > /sys/kernel/mm/transparent_hugepage/enabled" ;;
266 |         esac
267 |     fi
268 |     if val=$(cat /sys/kernel/mm/transparent_hugepage/defrag 2>/dev/null) ; then
269 |         case $val in
270 |             *'[never]'*) ;;
271 |             *) print_fail
272 |                action \
273 |                     "transparent hugepage defrag is enabled." \
274 |                     "echo never > /sys/kernel/mm/transparent_hugepage/defrag" ;;
275 |         esac
276 |     fi
277 |     if val=$(cat /sys/kernel/mm/transparent_hugepage/khugepaged/defrag 2>/dev/null) ; then
278 |         if [ "$val" -ne 0 ] ; then
279 |             print_fail
280 |             action \
281 |                 "khugepaged defrag is enabled." \
282 |                 "echo 0 > /sys/kernel/mm/transparent_hugepage/khugepaged/defrag"
283 |         fi
284 |     fi
285 |     print_finish
286 | }
287 | thp_check
288 | 
289 | # Piping of core dumps to programs can make program crashes significantly
290 | # slower. Unfortunately, dumps to pipes are not affected by RLIMIT_CORE,
291 | # so we cannot easily disable them inside the sandbox.
292 | core_check() {
293 |     print_start_check "core file pattern"
294 |     local val
295 |     if val="$(cat /proc/sys/kernel/core_pattern)" ; then
296 |         if [ "${val:0:1}" = '|' ] ; then
297 |             print_fail
298 |             action \
299 |                 "core files are piped to a program." \
300 |                 "echo core >/proc/sys/kernel/core_pattern"
301 |         fi
302 |     else
303 |         print_skipped
304 |     fi
305 |     print_finish
306 | }
307 | core_check
308 | 
309 | # Without protected_hardlinks, the user running Isolate could trick it into
310 | # changing ownership of unrelated files.
311 | hardlink_check() {
312 |     print_start_check "hard link protection"
313 |     local val
314 |     if val="$(cat /proc/sys/fs/protected_hardlinks 2>/dev/null)" ; then
315 |         if [ "$val" = 0 ] ; then
316 |             print_fail
317 |             action \
318 |                 "hardlink protection is disabled." \
319 |                 "echo 1 >/proc/sys/fs/protected_hardlinks"
320 |         fi
321 |     else
322 |         print_skipped
323 |     fi
324 |     print_finish
325 | }
326 | hardlink_check
327 | 
328 | # Check for an Intel CPU with both P-cores and E-cores.
329 | # At the moment, we have no automatic remedy.
330 | asymmetric_core_check() {
331 |     print_start_check "asymmetric cores"
332 |     if [ -d /sys/devices/cpu_atom ] && [ -d /sys/devices/cpu_core ] ; then
333 |         print_dubious
334 |         quiet || warn "the CPU has a combination of P-cores and E-cores, core pinning should be used."
335 |     fi
336 |     print_finish
337 | }
338 | asymmetric_core_check
339 | 
340 | 
341 | exit $exit_status
342 | 


--------------------------------------------------------------------------------
/rules.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  *	Process Isolator -- Rules
  3 |  *
  4 |  *	(c) 2012-2025 Martin Mares <mj@ucw.cz>
  5 |  *	(c) 2012-2014 Bernard Blackham <bernard@blackham.com.au>
  6 |  */
  7 | 
  8 | #include "isolate.h"
  9 | 
 10 | #include <errno.h>
 11 | #include <fcntl.h>
 12 | #include <limits.h>
 13 | #include <mntent.h>
 14 | #include <stdio.h>
 15 | #include <stdlib.h>
 16 | #include <string.h>
 17 | #include <sys/capability.h>
 18 | #include <sys/mount.h>
 19 | #include <sys/quota.h>
 20 | #include <sys/stat.h>
 21 | #include <sys/syscall.h>
 22 | #include <sys/vfs.h>
 23 | #include <unistd.h>
 24 | 
 25 | /*** Environment rules ***/
 26 | 
 27 | struct env_rule {
 28 |   char *var;			// Variable to match
 29 |   char *val;			// ""=clear, NULL=inherit
 30 |   int var_len;
 31 |   struct env_rule *next;
 32 | };
 33 | 
 34 | static struct env_rule *first_env_rule;
 35 | static struct env_rule **last_env_rule = &first_env_rule;
 36 | 
 37 | static struct env_rule default_env_rules[] = {
 38 |   { .var = "LIBC_FATAL_STDERR_", .val = "1", .var_len = 18 },
 39 | };
 40 | 
 41 | int
 42 | set_env_action(char *a0)	// Append a rule "var" or "var=val"; returns 1, or 0 if the name is empty
 43 | {
 44 |   // An empty variable name is invalid; reject it before allocating, so that nothing leaks
 45 |   if (a0[0] == '=')
 46 |     return 0;
 47 | 
 48 |   struct env_rule *r = xmalloc(sizeof(*r) + strlen(a0) + 1);
 49 |   char *a = strcpy((char *)(r+1), a0);	// Private copy stored right behind the rule
 50 |   r->var = a;
 51 |   char *sep = strchr(a, '=');
 52 |   if (sep)
 53 |     {
 54 |       *sep++ = 0;			// "var=val": split in place
 55 |       r->val = sep;
 56 |     }
 57 |   else
 58 |     r->val = NULL;			// "var": inherit the value
 59 |   *last_env_rule = r;
 60 |   last_env_rule = &r->next;
 61 |   r->next = NULL;
 62 |   return 1;
 63 | }
 64 | 
 65 | static int
 66 | match_env_var(char *env_entry, struct env_rule *r)	// Is env_entry of the form "<r->var>=..."?
 67 | {
 68 |   if (strncmp(env_entry, r->var, r->var_len))		// First var_len characters differ
 69 |     return 0;
 70 |   return (env_entry[r->var_len] == '=');		// The name must end exactly here
 71 | }
 72 | 
 73 | static void
 74 | apply_env_rule(char **env, int *env_sizep, struct env_rule *r)
 75 | {
 76 |   // First remove the variable if already set
 77 |   int pos = 0;
 78 |   while (pos < *env_sizep && !match_env_var(env[pos], r))
 79 |     pos++;
 80 |   if (pos < *env_sizep)
 81 |     {
 82 |       (*env_sizep)--;
 83 |       env[pos] = env[*env_sizep];
 84 |       env[*env_sizep] = NULL;
 85 |     }
 86 | 
 87 |   // What is the new value?
 88 |   char *new;
 89 |   if (r->val)
 90 |     {
 91 |       if (!r->val[0])
 92 | 	return;
 93 |       new = xmalloc(r->var_len + 1 + strlen(r->val) + 1);
 94 |       sprintf(new, "%s=%s", r->var, r->val);
 95 |     }
 96 |   else
 97 |     {
 98 |       pos = 0;
 99 |       while (environ[pos] && !match_env_var(environ[pos], r))
100 | 	pos++;
101 |       if (!(new = environ[pos]))
102 | 	return;
103 |     }
104 | 
105 |   // Add it at the end of the array
106 |   env[(*env_sizep)++] = new;
107 |   env[*env_sizep] = NULL;
108 | }
109 | 
110 | char **
111 | setup_environment(void)
112 | {
113 |   // Link built-in rules with user rules
114 |   for (int i=ARRAY_SIZE(default_env_rules)-1; i >= 0; i--)
115 |     {
116 |       default_env_rules[i].next = first_env_rule;
117 |       first_env_rule = &default_env_rules[i];
118 |     }
119 | 
120 |   // Scan the original environment
121 |   char **orig_env = environ;
122 |   int orig_size = 0;
123 |   while (orig_env[orig_size])
124 |     orig_size++;
125 | 
126 |   // For each rule, reserve one more slot and calculate length
127 |   int num_rules = 0;
128 |   for (struct env_rule *r = first_env_rule; r; r=r->next)
129 |     {
130 |       num_rules++;
131 |       r->var_len = strlen(r->var);
132 |     }
133 | 
134 |   // Create a new environment
135 |   char **env = xmalloc((orig_size + num_rules + 1) * sizeof(char *));
136 |   int size;
137 |   if (pass_environ)
138 |     {
139 |       memcpy(env, environ, orig_size * sizeof(char *));
140 |       size = orig_size;
141 |     }
142 |   else
143 |     size = 0;
144 |   env[size] = NULL;
145 | 
146 |   // Apply the rules one by one
147 |   for (struct env_rule *r = first_env_rule; r; r=r->next)
148 |     apply_env_rule(env, &size, r);
149 | 
150 |   // Return the new env and pass some gossip
151 |   if (verbose > 1)
152 |     {
153 |       fprintf(stderr, "Passing environment:\n");
154 |       for (int i=0; env[i]; i++)
155 | 	fprintf(stderr, "\t%s\n", env[i]);
156 |     }
157 |   return env;
158 | }
159 | 
160 | /*** Directory rules ***/
161 | 
162 | struct dir_rule {
163 |   char *inside;			// A relative path
164 |   char *outside;		// This can be an absolute path or a relative path starting with "./"
165 |   unsigned int flags;		// DIR_FLAG_xxx
166 |   struct dir_rule *next;
167 | };
168 | 
169 | enum dir_rule_flags {
170 |   DIR_FLAG_RW = 1,
171 |   DIR_FLAG_NOEXEC = 2,
172 |   DIR_FLAG_FS = 4,
173 |   DIR_FLAG_MAYBE = 8,
174 |   DIR_FLAG_DEV = 16,
175 |   DIR_FLAG_TMP = 32,
176 |   DIR_FLAG_NOREC = 64,
177 |   DIR_FLAG_DEFAULT = 1U << 15,	// Used internally
178 |   DIR_FLAG_DISABLED = 1U << 16,	// Used internally
179 | };
180 | 
181 | static const char * const dir_flag_names[] = { "rw", "noexec", "fs", "maybe", "dev", "tmp", "norec" };
182 | 
183 | static struct dir_rule *first_dir_rule;
184 | static struct dir_rule **last_dir_rule = &first_dir_rule;
185 | 
186 | static char *
187 | sanitize_dir_path(char *path)
188 | {
189 |   // Strip leading slashes
190 |   while (*path == '/')
191 |     path++;
192 |   if (!*path)
193 |     return NULL;
194 | 
195 |   // Check for ".." components
196 |   char *p = path;
197 |   while (*p)
198 |     {
199 |       char *next = strchr(p, '/');
200 |       if (!next)
201 | 	next = p + strlen(p);
202 | 
203 |       int len = next - p;
204 |       if (len == 2 && !memcmp(p, "..", 2))
205 | 	return NULL;
206 | 
207 |       p = *next ? next+1 : next;
208 |     }
209 | 
210 |   return path;
211 | }
212 | 
213 | static int
214 | add_dir_rule(char *in, char *out, unsigned int flags)
215 | {
216 |   // Make sure that "in" does not try to escape the box
217 |   in = sanitize_dir_path(in);
218 |   if (!in)
219 |     return 0;
220 | 
221 |   // Override an existing rule
222 |   struct dir_rule *r;
223 |   for (r = first_dir_rule; r; r = r->next)
224 |     if (!strcmp(r->inside, in))
225 |       break;
226 | 
227 |   // Add a new rule
228 |   if (!r)
229 |     {
230 |       r = xmalloc(sizeof(*r));
231 |       r->inside = in;
232 |       *last_dir_rule = r;
233 |       last_dir_rule = &r->next;
234 |       r->next = NULL;
235 |     }
236 |   r->outside = out;
237 |   r->flags = flags;
238 |   return 1;
239 | }
240 | 
241 | static unsigned int
242 | parse_dir_option(char *opt)			// Map an option name to its flag bit
243 | {
244 |   for (unsigned int i = 0; i < ARRAY_SIZE(dir_flag_names); i++)
245 |     if (!strcmp(opt, dir_flag_names[i]))
246 |       return 1U << i;				// Name at index i corresponds to bit i
247 |   die("Unknown directory option %s", opt);
248 | }
249 | 
250 | static int
251 | set_dir_action_ext(char *arg, unsigned int ext_flags)
252 | {
253 |   arg = xstrdup(arg);
254 | 
255 |   char *colon = strchr(arg, ':');
256 |   unsigned int flags = ext_flags;
257 |   while (colon)
258 |     {
259 |       *colon++ = 0;
260 |       char *next = strchr(colon, ':');
261 |       if (next)
262 | 	*next = 0;
263 |       flags |= parse_dir_option(colon);
264 |       colon = next;
265 |     }
266 | 
267 |   char *eq = strchr(arg, '=');
268 |   if (eq)
269 |     *eq++ = 0;
270 | 
271 |   if ((flags & DIR_FLAG_FS) && (flags & DIR_FLAG_TMP))
272 |     return 0;
273 | 
274 |   if (flags & DIR_FLAG_FS)
275 |     {
276 |       if (!eq || strchr(eq, '/'))
277 | 	return 0;
278 |       return add_dir_rule(arg, eq, flags);
279 |     }
280 |   else if (flags & DIR_FLAG_TMP)
281 |     {
282 |       if (eq)
283 | 	return 0;
284 |       /*
285 |        *  Construct an outside temporary directory, which will be later
286 |        *  chowned to box_uid. The hierarchy of these directories is intentionally
287 |        *  flat, so that we avoid writing to a directory which might have already
288 |        *  been tampered with in a previous run of the sandbox.
289 |        */
290 |       char out[1024];
291 |       snprintf(out, sizeof(out), "./tmp/%s", arg);
292 |       for (char *p = out + strlen("./tmp/"); *p; p++)
293 | 	if (*p == '/')
294 | 	  *p = ':';		// This is safe, there were no colons in "out"
295 |       return add_dir_rule(arg, xstrdup(out), flags | DIR_FLAG_RW);
296 |     }
297 |   else if (eq)
298 |     {
299 |       if (!eq[0])
300 | 	return add_dir_rule(arg, NULL, flags);
301 |       if (eq[0] != '/' && strncmp(eq, "./", 2))
302 | 	return 0;
303 |       return add_dir_rule(arg, eq, flags);
304 |     }
305 |   else
306 |     {
307 |       char *out = xmalloc(1 + strlen(arg) + 1);
308 |       sprintf(out, "/%s", arg);
309 |       return add_dir_rule(arg, out, flags);
310 |     }
311 | }
312 | 
313 | int
314 | set_dir_action(char *arg)
315 | {
316 |   return set_dir_action_ext(arg, 0);
317 | }
318 | 
319 | static int
320 | set_dir_action_default(char *arg)
321 | {
322 |   return set_dir_action_ext(arg, DIR_FLAG_DEFAULT);
323 | }
324 | 
325 | void
326 | init_dir_rules(void)
327 | {
328 |   set_dir_action_default("box=./box:rw");
329 |   set_dir_action_default("bin");
330 |   set_dir_action_default("dev:dev");
331 |   set_dir_action_default("lib");
332 |   set_dir_action_default("lib64:maybe");
333 |   set_dir_action_default("proc=proc:fs");
334 |   set_dir_action_default("tmp:tmp");
335 |   set_dir_action_default("usr");
336 | }
337 | 
338 | static void
339 | set_cap_sys_admin(void)
340 | {
341 |   cap_t caps;
342 |   if (!(caps = cap_get_proc()))
343 |     die("Cannot get capabilities: %m");
344 |   // Raise CAP_SYS_ADMIN in the effective set of the current process
345 |   cap_value_t cap_list[] = { CAP_SYS_ADMIN };
346 |   if (cap_set_flag(caps, CAP_EFFECTIVE, 1, cap_list, CAP_SET) < 0)
347 |     die("Cannot modify capabilities: %m");
348 | 
349 |   if (cap_set_proc(caps) < 0)
350 |     die("Cannot set capabilities: %m");
351 | 
352 |   cap_free(caps);
353 | }
354 | 
355 | void
356 | apply_dir_rules(int with_defaults)
357 | {
358 |   /*
359 |    * Before mounting anything, we create all mount points inside the box.
360 |    * This is necessary to avoid bypassing directory permissions. If you
361 |    * want nested binds, you have to create the mount points explicitly.
362 |    */
363 |   for (struct dir_rule *r = first_dir_rule; r; r=r->next)
364 |     {
365 |       if (!with_defaults && (r->flags & DIR_FLAG_DEFAULT))
366 |         continue;
367 | 
368 |       char *in = r->inside;
369 |       char *out = r->outside;
370 | 
371 |       if (!out)
372 | 	{
373 | 	  msg("Not binding anything on %s\n", in);
374 | 	  r->flags |= DIR_FLAG_DISABLED;
375 | 	  continue;
376 | 	}
377 | 
378 |       if ((r->flags & DIR_FLAG_MAYBE) && !dir_exists(out))
379 | 	{
380 | 	  msg("Not binding %s on %s (does not exist)\n", out, r->inside);
381 | 	  r->flags |= DIR_FLAG_DISABLED;
382 | 	  continue;
383 | 	}
384 | 
385 |       char root_in[1024];
386 |       snprintf(root_in, sizeof(root_in), "root/%s", in);
387 |       make_dir(root_in);
388 |     }
389 | 
390 |   for (struct dir_rule *r = first_dir_rule; r; r=r->next)
391 |     {
392 |       if (r->flags & DIR_FLAG_DISABLED)
393 | 	continue;
394 |       if (!with_defaults && (r->flags & DIR_FLAG_DEFAULT))
395 |         continue;
396 | 
397 |       char *in = r->inside;
398 |       char *out = r->outside;
399 |       char root_in[1024];
400 |       snprintf(root_in, sizeof(root_in), "root/%s", in);
401 | 
402 |       if (r->flags & DIR_FLAG_TMP)
403 | 	{
404 | 	  make_dir(out);
405 | 	  if (chown(out, box_uid, box_gid) < 0)
406 | 	    die("Cannot chown %s: %m", out);
407 | 	  if (chmod(out, 0700) < 0)
408 | 	    die("Cannot chmod %s: %m", out);
409 | 	}
410 | 
411 |       unsigned long mount_flags = 0;
412 |       if (!(r->flags & DIR_FLAG_RW))
413 | 	mount_flags |= MS_RDONLY;
414 |       if (r->flags & DIR_FLAG_NOEXEC)
415 | 	mount_flags |= MS_NOEXEC;
416 |       if (!(r->flags & DIR_FLAG_DEV))
417 | 	mount_flags |= MS_NODEV;
418 | 
419 |       if (r->flags & DIR_FLAG_FS)
420 | 	{
421 | 	  msg("Mounting %s on %s (flags %lx)\n", out, in, mount_flags);
422 | 	  if (mount("none", root_in, out, mount_flags, "") < 0)
423 | 	    die("Cannot mount %s on %s: %m", out, in);
424 | 	  if (!strcmp(in, "proc"))
425 | 	    {
426 | 	      // If we are mounting procfs, add hidepid=2, so that only the processes
427 | 	      // of the same user are visible. This has to be done as a remount.
428 | 	      if (mount("none", root_in, out, MS_REMOUNT | mount_flags, "hidepid=2") < 0)
429 | 		die("Cannot re-mount proc with hidepid option: %m");
430 | 	    }
431 | 	}
432 |       else
433 | 	{
434 | 	  mount_flags |= MS_BIND | MS_NOSUID;
435 | 	  if (!(r->flags & DIR_FLAG_NOREC))
436 | 	    mount_flags |= MS_REC;
437 | 	  msg("Binding %s on %s (flags %lx)\n", out, in, mount_flags);
438 | 
439 | 	  /*
440 | 	   *  This is tricky. We cannot run mount() with root privileges, since
441 | 	   *  it could be used to bypass access control if the mounted path
442 | 	   *  contains elements inaccessible to the user running isolate.
443 | 	   *
444 | 	   *  We switch effective UID and GID back to the calling user (which clears
445 | 	   *  all capabilities, but keeps them in the permitted set) and then
446 | 	   *  enable CAP_SYS_ADMIN. So we have CAP_SYS_ADMIN (needed for mount),
447 | 	   *  but not CAP_DAC_OVERRIDE (which allows to bypass permission checks).
448 | 	   */
449 | 
450 | 	  if (setresuid(orig_uid, orig_uid, 0) < 0 ||
451 | 	      setresgid(orig_gid, orig_gid, 0) < 0)
452 | 	    die("Cannot switch UID and GID: %m");
453 | 
454 | 	  set_cap_sys_admin();
455 | 
456 | 	  // Most mount flags need remount to work
457 | 	  if (mount(out, root_in, "none", mount_flags, "") < 0 ||
458 | 	      mount(out, root_in, "none", MS_REMOUNT | mount_flags, "") < 0)
459 | 	    die("Cannot mount %s on %s: %m", out, in);
460 | 
461 | 	  if (setresuid(orig_uid, 0, orig_uid) < 0 ||
462 | 	      setresgid(orig_gid, 0, orig_gid) < 0)
463 | 	    die("Cannot switch UID and GID: %m");
464 | 	}
465 |     }
466 | }
467 | 
468 | /*** Disk quotas ***/
469 | 
470 | static void
471 | quotactl_error(void)
472 | {
473 |   // This errno has an outstandingly unhelpful message of "no such process".
474 |   if (errno == ESRCH)
475 |     die("Cannot set disk quota: quotas have not been enabled for this filesystem");
476 |   die("Cannot set disk quota: %m");
477 | }
478 | 
479 | void
480 | set_quota(void)
481 | {
482 |   if (!block_quota)
483 |     return;
484 | 
485 |   struct dqblk dq = {
486 |     .dqb_bhardlimit = block_quota,
487 |     .dqb_bsoftlimit = block_quota,
488 |     .dqb_ihardlimit = inode_quota,
489 |     .dqb_isoftlimit = inode_quota,
490 |     .dqb_valid = QIF_LIMITS,
491 |   };
492 |   void *dq_ptr = (void*)&dq;
493 |   int quota_op = QCMD(Q_SETQUOTA, USRQUOTA);
494 | 
495 |   int cwd_fd = open(".", O_DIRECTORY | O_PATH);
496 |   if (cwd_fd < 0)
497 |     die("open: %m");
498 | 
499 |   if (syscall(SYS_quotactl_fd, cwd_fd, quota_op, box_uid, dq_ptr) < 0)
500 |     quotactl_error();
501 | 
502 |   close(cwd_fd);
503 | 
504 |   msg("Quota: Set block quota %d and inode quota %d\n", block_quota, inode_quota);
505 | }
506 | 


--------------------------------------------------------------------------------
/isolate.1.txt:
--------------------------------------------------------------------------------
  1 | ISOLATE(1)
  2 | ==========
  3 | 
  4 | NAME
  5 | ----
  6 | isolate - Isolate a process using Linux Containers
  7 | 
  8 | SYNOPSIS
  9 | --------
 10 | *isolate* 'options' *--init*
 11 | 
 12 | *isolate* 'options' *--run* +--+ 'program' 'arguments'
 13 | 
 14 | *isolate* 'options' *--cleanup*
 15 | 
 16 | DESCRIPTION
 17 | -----------
 18 | Run 'program' within a sandbox, so that it cannot communicate with the
 19 | outside world and its resource consumption is limited. This can be used
 20 | for example in a programming contest to run untrusted programs submitted
 21 | by contestants in a controlled environment.
 22 | 
 23 | The sandbox is used in the following way:
 24 | 
 25 | * Run *isolate --init*, which initializes the sandbox, creates its working directory and
 26 | prints its name to the standard output. If the sandbox already existed, it
 27 | is reset.
 28 | 
 29 | * Populate the directory with the executable file of the program and its
 30 | input files.
 31 | 
 32 | * Call *isolate --run* to run the program. A single line describing the
 33 | status of the program is written to the standard error stream.
 34 | 
 35 | * Fetch the output of the program from the directory.
 36 | 
 37 | * Run *isolate --cleanup* to remove temporary files. Does nothing if the sandbox
 38 | was already cleaned up.
 39 | 
 40 | Please note that by default, the program is not allowed to start multiple
 41 | processes or threads. If you need that, turn on the control group mode
 42 | (see below).
 43 | 
 44 | BASIC OPTIONS
 45 | -------------
 46 | *-b, --box-id=*'id'::
 47 | 	When you run multiple sandboxes in parallel, you have to assign unique
 48 | 	IDs to them by this option. See the discussion on UIDs in the INSTALLATION
 49 | 	section. The ID defaults to 0.
 50 | 
 51 | *-M, --meta=*'file'::
 52 | 	Output meta-data on the execution of the program to a given file.
 53 | 	See below for syntax of the meta-files.
 54 | 
 55 | *-i, --stdin=*'file'::
 56 | 	Redirect standard input from 'file'. The 'file' has to be accessible
 57 | 	inside the sandbox (which means that the sandboxed program can manipulate
 58 | 	it arbitrarily). If not specified, standard input is inherited from the
 59 | 	parent process.
 60 | 
 61 | *-o, --stdout=*'file'::
 62 | 	Redirect standard output to 'file'. The 'file' has to be accessible
 63 | 	inside the sandbox (which means that the sandboxed program can manipulate
 64 | 	it arbitrarily). If not specified, standard output is inherited from the
 65 | 	parent process and the sandbox manager does not write anything to it.
 66 | 
 67 | *-r, --stderr=*'file'::
 68 | 	Redirect standard error output to 'file'. The 'file' has to be accessible
 69 | 	inside the sandbox (which means that the sandboxed program can manipulate
 70 | 	it arbitrarily). If not specified, standard error output is inherited from the
 71 | 	parent process. See also *--stderr-to-stdout*.
 72 | 
 73 | *--stderr-to-stdout*::
 74 | 	Redirect standard error output to standard output. This is performed after
 75 | 	the standard output is redirected by *--stdout*. Mutually exclusive with *--stderr*.
 76 | 
 77 | *-c, --chdir=*'dir'::
 78 | 	Change directory to 'dir' before executing the program. This path must be
 79 | 	relative to the root of the sandbox.
 80 | 
 81 | *-v, --verbose*::
 82 | 	Tell the sandbox manager to be verbose and report on what is going on.
 83 | 	Using *-v* multiple times produces even more jabber.
 84 | 
 85 | *-s, --silent*::
 86 | 	Tell the sandbox manager to keep silence. No status messages are printed
 87 | 	to stderr except for fatal errors of the sandbox itself. The combination of
 88 | 	*--verbose* and *--silent* has an undefined effect.
 89 | 
 90 | *--wait*::
 91 | 	Multiple instances of Isolate cannot manage the same sandbox simultaneously.
 92 | 	If you attempt to do that, the new instance refuses to run. With this option,
 93 | 	the new instance waits for the other instance to finish.
 94 | 
 95 | LIMITS
 96 | ------
 97 | The following options can limit system resources consumed by the program.
 98 | 
 99 | *-m, --mem=*'size'::
100 | 	Limit address space of the program to 'size' kilobytes. If more processes
101 | 	are allowed, this applies to each of them separately. If this limit is reached,
102 | 	further memory allocations fail (e.g., malloc returns NULL).
103 | 
104 | *-t, --time=*'time'::
105 | 	Limit run time of the program to 'time' seconds. Fractional numbers are allowed.
106 | 	Time in which the OS assigns the processor to other tasks is not counted.
107 | 	If this limit is exceeded, the program is killed (after *--extra-time*, if set).
108 | 
109 | *-w, --wall-time=*'time'::
110 | 	Limit wall-clock time to 'time' seconds. Fractional values are allowed.
111 | 	This clock measures the time from the start of the program to its exit,
112 | 	so it does not stop when the program has lost the CPU or when it is waiting
113 | 	for an external event. We recommend using *--time* as the main limit,
114 | 	but set *--wall-time* to a much higher value as a precaution against
115 | 	sleeping programs.
116 | 	If this limit is exceeded, the program is killed.
117 | 
118 | *-x, --extra-time=*'time'::
119 | 	When the *--time* limit is exceeded, do not kill the program immediately,
120 | 	but wait until *--extra-time* seconds elapse since the start of the program.
121 | 	This allows one to report the real execution time, even if it exceeds the limit
122 | 	slightly. Fractional numbers are allowed.
123 | 
124 | *-k, --stack=*'size'::
125 | 	Limit process stack to 'size' kilobytes. By default, the whole address
126 | 	space is available for the stack, but it is subject to the *--mem* limit.
127 | 	If this limit is exceeded, the program receives the SIGSEGV signal.
128 | 
129 | *-n, --open-files=*'max'::
130 | 	Limit number of open files to 'max'. The default value is 64. Setting this
131 | 	option to 0 will result in unlimited open files.
132 | 	If this limit is reached, system calls creating file descriptors fail
133 | 	with error EMFILE.
134 | 
135 | *-f, --fsize=*'size'::
136 | 	Limit size of each file created (or modified) by the program to 'size' kilobytes.
137 | 	In most cases, it is better to restrict overall disk usage by a disk quota
138 | 	(see below). This option can help in cases when quotas are not enabled
139 | 	on the underlying filesystem.
140 | 	If this limit is reached, system calls expanding files fail with error
141 | 	EFBIG and the program receives the SIGXFSZ signal.
142 | 
143 | *-q, --quota=*'blocks'*,*'inodes'::
144 | 	Set disk quota to a given number of blocks and inodes. This requires the
145 | 	filesystem to be mounted with support for quotas. Unlike other options,
146 | 	this one must be given to *isolate --init*. Please note that this
147 | 	currently works only on the ext family of filesystems (other filesystems
148 | 	use other interfaces for setting quotas).
149 | 	If the quota is reached, system calls expanding files fail with error EDQUOT.
150 | 
151 | *--core=*'size'::
152 | 	Limit size of core files created when a process crashes to 'size' kilobytes.
153 | 	Defaults to zero, meaning that no core files are produced inside the sandbox.
154 | 
155 | *-p, --processes*[*=*'max']::
156 | 	Permit the program to create up to 'max' processes and/or threads. Please
157 | 	keep in mind that time and memory limits do not work with multiple processes
158 | 	unless you enable the control group mode. If 'max' is not given, an arbitrary
159 | 	number of processes can be run. By default, only one process is permitted.
160 | 	If this limit is exceeded, system calls creating processes fail with error
161 | 	EAGAIN.
162 | 
163 | ENVIRONMENT RULES
164 | -----------------
165 | UNIX processes normally inherit all environment variables from their parent. The
166 | sandbox however passes only those variables which are explicitly requested by
167 | environment rules:
168 | 
169 | *-E, --env=*'var'::
170 | 	Inherit the variable 'var' from the parent.
171 | 
172 | *-E, --env=*'var'*=*'value'::
173 | 	Set the variable 'var' to 'value'. When the 'value' is empty, the
174 | 	variable is removed from the environment.
175 | 
176 | *-e, --full-env*::
177 | 	Inherit all variables from the parent.
178 | 
179 | The rules are applied in the order in which they were given, except for
180 | *--full-env*, which is applied first.
181 | 
182 | The list of rules is automatically initialized with *-ELIBC_FATAL_STDERR_=1*.
183 | 
184 | DIRECTORY RULES
185 | ---------------
186 | The sandboxed process gets its own filesystem namespace, which contains only subtrees
187 | requested by directory rules:
188 | 
189 | *-d, --dir=*'in'*=*'out'[*:*'options']::
190 | 	Bind the directory 'out' as seen by the caller to the path 'in' inside the sandbox.
191 | 	If there already was a directory rule for 'in', it is replaced.
192 | 
193 | *-d, --dir=*'dir'[*:*'options']::
194 | 	Bind the directory +/+'dir' to 'dir' inside the sandbox.
195 | 	If there already was a directory rule for 'dir', it is replaced.
196 | 
197 | *-d, --dir=*'in'*=*::
198 | 	Remove a directory rule for the path 'in' inside the sandbox.
199 | 
200 | By default, all directories are bound read-only and restricted (no devices,
201 | no setuid binaries). This behavior can be modified using the 'options':
202 | 
203 | *rw*::
204 | 	Allow read-write access.
205 | 
206 | *dev*::
207 | 	Allow access to character and block devices.
208 | 
209 | *noexec*::
210 | 	Disallow execution of binaries.
211 | 
212 | *maybe*::
213 | 	Silently ignore the rule if the directory to be bound does not exist.
214 | 
215 | *fs*::
216 | 	Instead of binding a directory, mount a device-less filesystem called 'in'.
217 | 	For example, this can be 'proc' or 'sysfs'.
218 | 
219 | *tmp*::
220 | 	Bind a freshly created temporary directory writeable for the sandbox user.
221 | 	Accepts no 'out', implies *rw*.
222 | 
223 | *norec*::
224 | 	Do not bind recursively. Without this option, mount points in the outside
225 | 	directory tree are automatically propagated to the sandbox.
226 | 
227 | Unless *--no-default-dirs* is specified, the default set of directory rules binds +/bin+,
228 | +/dev+ (with devices allowed), +/lib+, +/lib64+ (if it exists), and +/usr+. It also binds
229 | the working directory to +/box+ (read-write), mounts the proc filesystem at +/proc+, and
230 | creates a temporary directory +/tmp+.
231 | 
232 | *-D, --no-default-dirs*::
233 | 	Do not bind the default set of directories. Care has to be taken to specify
234 | 	the correct set of rules (using *--dir*) for the executed program to run
235 | 	correctly. In particular, +/box+ has to be bound.
236 | 
237 | The rules are executed in the order in which they are given. Default rules come before
238 | all user rules. When a rule is replaced, it retains the original position
239 | in the order. This matters when one rule's 'in' is a sub-directory of another
240 | rule's 'in'. For example if you first bind to 'a' and then to 'a/b', it will work as
241 | expected, but a sub-directory 'b' must have existed in the directory bound to 'a' (isolate
242 | never creates subdirectories in bound directories for security reasons). If the
243 | order is 'a/b' before 'a', then the directory bound to 'a/b' is hidden
244 | by the later binding on 'a'.
245 | 
246 | CONTROL GROUPS
247 | --------------
248 | Isolate can make use of system control groups provided by the kernel
249 | to constrain programs consisting of multiple processes. Please note
250 | that this feature needs special system setup described in the INSTALLATION
251 | section.
252 | 
253 | *--cg*::
254 | 	Enable use of control groups. This should be specified with *--init*,
255 | 	*--run* and *--cleanup*.
256 | 
257 | *--cg-mem=*'size'::
258 | 	Limit total memory usage by the whole control group to 'size' kilobytes.
259 | 	This should be specified with *--run*.
260 | 	Effect of reaching this limit depends on circumstances.
261 | 	If it happens during memory allocation, the allocation can fail or memory
262 | 	can be over-committed by the kernel.
263 | 	If it happens when handling a page fault, the whole process is killed
264 | 	by the OOM killer with the SIGKILL signal.
265 | 
266 | *--print-cg-root*::
267 | 	Print the root of the control group hierarchy in */sys/* and exit.
268 | 	This is used by the *isolate-check-environment* script.
269 | 
270 | SPECIAL OPTIONS
271 | ---------------
272 | The following options can be useful in special cases.
273 | 
274 | *--share-net*::
275 | 	By default, isolate creates a new network namespace for its child process.
276 | 	This namespace contains no network devices except for a per-namespace loopback.
277 | 	This prevents the program from communicating with the outside world. If you want
278 | 	to permit communication, you can use this switch to keep the child process
279 | 	in parent's network namespace.
280 | 
281 | *--inherit-fds*::
282 | 	By default, isolate closes all file descriptors passed from its parent
283 | 	except for descriptors 0, 1, and 2.
284 | 	This prevents unintentional descriptor leaks. In some cases, passing extra
285 | 	descriptors to the sandbox can be desirable, so you can use this switch
286 | 	to make them survive.
287 | 
288 | *--tty-hack*::
289 | 	Try to handle interactive programs communicating over a tty.
290 | 	The sandboxed program will run in a separate process group, which will temporarily
291 | 	become the foreground process group of the terminal. When the program exits, the
292 | 	process group will be switched back to the caller. Please note that the program
293 | 	can do many nasty things including (but not limited to) changing terminal settings,
294 | 	changing the line discipline, and stuffing characters to the terminal's input queue
295 | 	using the TIOCSTI ioctl. Use with extreme caution.
296 | 
297 | *--special-files*::
298 | 	By default, Isolate removes all special files (other than regular files
299 | 	and directories) created inside the sandbox. If you need them, this option disables
300 | 	that behavior, but you need to carefully check what you open.
301 | 
302 | *--as-uid=*'uid', *--as-gid=*'gid'::
303 | 	Act on behalf of the specified user and group (only if Isolate was invoked by root).
304 | 	This is used in scenarios where a root-controlled process manages creation of sandboxes
305 | 	for regular users, usually in conjunction with the *restricted_init* option in
306 | 	the configuration file.
307 | 
308 | META-FILES
309 | ----------
310 | The meta-file contains miscellaneous meta-information on execution of the
311 | program within the sandbox. It is a textual file consisting of lines
312 | of format 'key'*:*'value'. The following keys are defined:
313 | 
314 | *cg-mem*::
315 | 	When control groups are enabled, this is the total memory use
316 | 	by the whole control group (in kilobytes). If you use *isolate --run*
317 | 	multiple times in the same sandbox, the control group retains cached
318 | 	data from the previous runs, which also contributes to *cg-mem*.
319 | *cg-oom-killed*::
320 | 	Present when the program was killed by the out-of-memory killer
321 | 	(e.g., because it has exceeded the memory limit of its control group).
322 | 	This is reported only on Linux 4.13 and later.
323 | *csw-forced*::
324 | 	Number of context switches forced by the kernel.
325 | *csw-voluntary*::
326 | 	Number of context switches caused by the process giving up the CPU
327 | 	voluntarily.
328 | *exitcode*::
329 | 	The program has exited normally with this exit code.
330 | *exitsig*::
331 | 	The program has exited after receiving this fatal signal.
332 | *killed*::
333 | 	Present when the program was terminated by the sandbox
334 | 	(e.g., because it has exceeded the time limit).
335 | *max-rss*::
336 | 	Maximum resident set size of the process (in kilobytes).
337 | *message*::
338 | 	Status message, not intended for machine processing.
339 | 	E.g., "Time limit exceeded."
340 | *status*::
341 | 	Two-letter status code:
342 | 	* *RE* -- run-time error, i.e., exited with a non-zero exit code
343 | 	* *SG* -- program died on a signal
344 | 	* *TO* -- timed out
345 | 	* *XX* -- internal error of the sandbox
346 | *time*::
347 | 	Run time of the program in fractional seconds.
348 | *time-wall*::
349 | 	Wall clock time of the program in fractional seconds.
350 | 
351 | Please note that not all keys have to be present.
352 | For example, neither *status* nor *message* is reported upon normal termination.
353 | 
354 | RETURN VALUE
355 | ------------
356 | When the program inside the sandbox finishes correctly, the sandbox returns 0.
357 | If it finishes incorrectly, it returns 1.
358 | All other return codes signal an internal error.
359 | 
360 | INSTALLATION
361 | ------------
362 | Isolate depends on several advanced features of the Linux kernel, like different
363 | kinds of namespaces and control groups. These features are available in kernels
364 | of most Linux distributions now, but if you are building your own kernel, you
365 | have to be careful.
366 | 
367 | Isolate is designed to run setuid to root. The sub-process inside the sandbox
368 | then switches to a non-privileged user ID (different for each *--box-id*).
369 | The range of UIDs available and several filesystem paths are set in a configuration
370 | file, by default located in /usr/local/etc/isolate.
371 | 
372 | For control group mode:
373 | 
374 | - Linux supports two incompatible implementations of control groups: cgroup v1 and v2.
375 |   This version of Isolate requires v2, which is the default on recent systems.
376 | 
377 | - If you are using systemd, you need to start the `isolate.service` (see service files
378 |   in the `systemd` directory in Isolate's source tree). It establishes
379 |   `isolate.scope` whose cgroup subtree is delegated to Isolate by systemd.
380 |   The service runs a simple daemon *isolate-cg-keeper*(8) to keep the scope alive.
381 | 
382 | - If you are not using systemd, make sure that Isolate's configuration file
383 |   refers to the correct location where you have the cgroup filesystem mounted.
384 |   Also make sure that whatever service manager you are using, it does not
385 |   interfere with Isolate's use of control groups.
386 | 
387 | - Running Isolate in containers is not recommended, since container managers
388 |   usually do not delegate control groups properly. Besides, you do not want
389 |   to share the machine with other workloads, which would influence measurement
390 |   of execution time. If you still want to use containers, you are on your own
391 |   and you probably have to make them privileged.
392 | 
393 | - Reporting memory usage requires Linux kernel 5.19 or newer.
394 | 
395 | - Since memory limits do not affect swapped-out data, we recommend turning off
396 |   swap completely.
397 | 
398 | Isolate expects that the root directory "/" is a mount point. When running
399 | isolate inside a chroot, this may not be the case, and isolate may fail with
400 | "Cannot privatize mounts". A workaround for this is to convert the root
401 | directory of the chroot into a mount point using a bind mount, prior to
402 | entering the chroot and running isolate. For example:
403 | 
404 |     mount --bind /path/to/chroot /path/to/chroot
405 | 
406 | It is recommended to have the +fs.protected_hardlinks+ sysctl set to 1
407 | (which is probably the default on modern Linux systems). Otherwise, the user running
408 | the sandbox could trick isolate into changing the owner of unrelated files.
409 | 
410 | If you have systemd-coredump installed, please keep in mind that it records core
411 | files even for processes inside the sandbox. As it configures the kernel to deliver
412 | core dumps using a pipe, it is not affected by the *--core* limit.
413 | 
414 | REPRODUCIBILITY
415 | ---------------
416 | The reproducibility of results can be improved by tuning some kernel
417 | parameters, listed below. Some of these parameters can be checked using the
418 | program isolate-check-environment.
419 | 
420 | * Disable address space randomization: +sysctl kernel.randomize_va_space=0+.
421 | Address space randomization can affect timing, memory usage, and program
422 | behavior. This setting can be made persistent through /etc/sysctl.d/.
423 | 
424 | * Disable dynamic CPU frequency scaling. This is done by setting the cpufreq
425 | scaling governor in /sys/devices/system/cpu/cpufreq/*/scaling_governor to +performance+.
426 | (On Intel CPUs, frequency scaling can be controlled by the `intel_pstate` driver,
427 | but it still provides its own +performance+ controller to the cpufreq subsystem.)
428 | 
429 | * Consider disabling frequency boosting on CPUs that might support it (this
430 | includes most i3/i5/i7 Intel CPUs and the AMD Zen architecture). This is done
431 | either by writing 1 to /sys/devices/system/cpu/intel_pstate/no_turbo (on Intel CPUs)
432 | or by writing 0 to /sys/devices/system/cpu/cpufreq/boost (other machines).
433 | 
434 | * Run evaluations on a single CPU (core). The Linux scheduler has a tendency to randomly
435 | migrate tasks between CPUs, incurring cache migration costs. You can use isolate's
436 | configuration file to pin the process to a specified CPU.
437 | 
438 | * If you have a CPU with a mix of different cores (e.g., P-cores and E-cores in certain Intel CPUs),
439 | pin the sandbox to a homogeneous subset of cores.
440 | 
441 | * Disable automatic kernel support for transparent huge pages. Both /sys/kernel/mm/transparent_hugepage/enabled
442 | and /sys/kernel/mm/transparent_hugepage/defrag should be set to "madvise" or "never", and
443 | /sys/kernel/mm/transparent_hugepage/khugepaged/defrag to 0.
444 | 
445 | * Disable swapping. If you really need swap space and you are using cgroups,
446 | make sure that you have the memsw controller enabled, so that swap space is
447 | properly accounted for.
448 | 
449 | See further suggestions in the https://ioi.github.io/checklist/[IOI Technical Checklist].
450 | 
451 | LICENSE
452 | -------
453 | Isolate was written by Martin Mares and Bernard Blackham.
454 | It can be distributed and used under the terms of the GNU
455 | General Public License version 2 or any later version.
456 | 
457 | SEE ALSO
458 | --------
459 | *isolate-check-environment*(8), *isolate-cg-keeper*(8)
460 | 


--------------------------------------------------------------------------------
/isolate.c:
--------------------------------------------------------------------------------
   1 | /*
   2 |  *	A Process Isolator based on Linux Containers
   3 |  *
   4 |  *	(c) 2012-2024 Martin Mares <mj@ucw.cz>
   5 |  *	(c) 2012-2014 Bernard Blackham <bernard@blackham.com.au>
   6 |  */
   7 | 
   8 | #include "isolate.h"
   9 | 
  10 | #include <assert.h>
  11 | #include <errno.h>
  12 | #include <fcntl.h>
  13 | #include <getopt.h>
  14 | #include <grp.h>
  15 | #include <limits.h>
  16 | #include <sched.h>
  17 | #include <stdio.h>
  18 | #include <stdlib.h>
  19 | #include <string.h>
  20 | #include <net/if.h>
  21 | #include <sys/file.h>
  22 | #include <sys/mount.h>
  23 | #include <sys/resource.h>
  24 | #include <sys/signal.h>
  25 | #include <sys/socket.h>
  26 | #include <sys/stat.h>
  27 | #include <sys/time.h>
  28 | #include <sys/vfs.h>
  29 | #include <sys/wait.h>
  30 | #include <time.h>
  31 | #include <unistd.h>
  32 | 
  33 | /* May not be defined in older glibc headers */
  34 | #ifndef MS_PRIVATE
  35 | #warning "Working around old glibc: no MS_PRIVATE"
  36 | #define MS_PRIVATE (1 << 18)
  37 | #endif
  38 | #ifndef MS_REC
  39 | #warning "Working around old glibc: no MS_REC"
  40 | #define MS_REC     (1 << 14)
  41 | #endif
  42 | 
  43 | /*
  44 |  * Theory of operation
  45 |  *
  46 |  * Generally, we want to run a process inside a namespace/cgroup and watch it
  47 |  * from the outside. However, the reality is a little bit more complicated as we
  48 |  * do not want the inside process to become the init process of the PID namespace
  49 |  * (we want to have all signals properly delivered).
  50 |  *
  51 |  * We are running three processes:
  52 |  *
  53 |  *   - Keeper process (root privileges, parent namespace, parent cgroups)
  54 |  *   - Proxy process (UID/GID of the calling user, init process of the child
  55 |  *     namespace, parent cgroups)
  56 |  *   - Inside process (per-box UID/GID, child namespace, child cgroups)
  57 |  *
  58 |  * The proxy process just waits for the inside process to exit and then it passes
  59 |  * the exit status to the keeper.
  60 |  *
  61 |  * We use two pipes:
  62 |  *
  63 |  *   - Error pipe for error messages produced by the proxy process and the early
  64 |  *     stages of the inside process (until exec()). Listened to by the keeper.
  65 |  *   - Status pipe for passing the PID of the inside process and its exit status
  66 |  *     from the proxy to the keeper.
  67 |  */
  68 | 
/* Period (in microseconds) of the ITIMER_REAL timer armed by box_keeper() to trigger time limit checks */
#define TIMER_INTERVAL_US 100000

/*
 *  Sandbox settings. Presumably filled in from command-line options
 *  (the option parser is not in this part of the file -- TODO confirm).
 */
static int timeout;			/* milliseconds */
static int wall_timeout;		/* compared with get_wall_time_ms(), so milliseconds; 0 = no check */
static int extra_timeout;		/* compared with run time in check_timeout(), so milliseconds */
int pass_environ;
int verbose;				/* non-zero enables msg(); values above 1 also print time checks */
static int silent;			/* when set, err() does not print the message to stderr */
static int fsize_limit;
static int memory_limit;
static int stack_limit;
static int open_file_limit = 64;
static int core_limit;
int block_quota;
int inode_quota;
static int max_processes = 1;
static char *redir_stdin, *redir_stdout, *redir_stderr;
static int redir_stderr_to_stdout;
static char *set_cwd;
static int share_net;
static int inherit_fds;
static int default_dirs = 1;
static int tty_hack;			/* when set, box_exit() restores the foreground process group of the tty */
static bool special_files;		/* passed to chowntree() in box_exit() */
static bool wait_if_busy;		/* when set, lock_box() waits for the lock instead of failing immediately */
static int as_uid = -1;
static int as_gid = -1;

int cg_enable;				/* non-zero when control group mode is used */
int cg_memory_limit;

int box_id;				/* sandbox number; also the name of the lock file within cf_lock_root */
static char box_dir[1024];
static pid_t box_pid;			/* PID of the inside process; 0 when not known */
static pid_t proxy_pid;			/* PID of the proxy process the keeper waits for; 0 once it has been reaped */

uid_t box_uid;
gid_t box_gid;
uid_t orig_uid;				/* recorded as the owner of the box in the lock file */
gid_t orig_gid;
static bool invoked_by_root;		/* when set, lock_box() skips the box ownership check */

static int partial_line;		/* the last verbose message did not end with a newline, see flush_line() */
static int cleanup_ownership;		/* when set, box_exit() gives the "box" tree back to orig_uid/orig_gid */

static struct timespec start_time;	/* CLOCK_MONOTONIC time taken when box_keeper() starts */
static int ticks_per_sec;		/* sysconf(_SC_CLK_TCK), used to convert /proc stat times */
static int total_ms, wall_ms;		/* final run time and wall clock time, set by final_stats() */
static volatile sig_atomic_t timer_tick, interrupt;	/* set by signal handlers, polled by the keeper loop */

/* Error pipe: see "Theory of operation" above */
static int error_pipes[2];
static int write_errors_to_fd;		/* when non-zero, die() writes its message to this descriptor and exits */
static int read_errors_from_fd;		/* keeper's reading end of the error pipe */

/* Status pipe: see "Theory of operation" above */
static int status_pipes[2];

/* Defined in the keeper section below, but already needed by final_stats() */
static int get_wall_time_ms(void);
static int get_run_time_ms(struct rusage *rus);
 127 | 
 128 | /*** Locks ***/
 129 | 
 130 | /*
 131 |  *  Whenever a sandbox is initialized, a lock file is created, which
 132 |  *  records which user owns the sandbox and whether the cgroup mode is used.
 133 |  *  Attempts to use the same sandbox by a different user are refused.
 134 |  *
 135 |  *  The lock file is locked whenever Isolate runs in that sandbox.
 136 |  */
 137 | 
/* Value of lock_record.magic; lock_box() rejects lock files with a different value */
#define LOCK_MAGIC 0x48736f6c

/* Contents of a lock file. It is read and written as raw bytes, so the layout is the file format. */
struct lock_record {
  uint32_t magic;			// Always LOCK_MAGIC
  uint32_t owner_uid;			// orig_uid of the user who initialized the box
  unsigned char cg_enabled;		// Value of cg_enable at initialization time
  unsigned char is_initialized;		// Zero when written by lock_box(true); non-init callers require it to be non-zero
  unsigned char rfu[2];			// Presumably "reserved for future use" -- never accessed in this part of the file
};

static int lock_fd = -1;		// Descriptor of the open lock file, -1 if none
static struct lock_record lock;		// In-memory copy of the lock file contents
 150 | 
 151 | static void
 152 | lock_write(void)
 153 | {
 154 |   int n = pwrite(lock_fd, &lock, sizeof(lock), 0);
 155 |   if (n != sizeof(lock))
 156 |     die("Cannot write lock file: %m");
 157 | }
 158 | 
 159 | static bool
 160 | lock_box(bool is_init)
 161 | {
 162 |   if (!dir_exists(cf_lock_root))
 163 |     make_dir(cf_lock_root);
 164 | 
 165 |   char lock_name[256];
 166 |   int name_len = snprintf(lock_name, sizeof(lock_name), "%s/%d", cf_lock_root, box_id);
 167 |   assert(name_len < (int) sizeof(lock_name));
 168 | 
 169 |   lock_fd = open(lock_name, O_RDWR | (is_init ? O_CREAT : 0), 0666);
 170 |   if (lock_fd < 0)
 171 |     {
 172 |       if (errno == ENOENT)
 173 | 	return false;
 174 |       die("Cannot open %s: %m", lock_name);
 175 |     }
 176 | 
 177 |   if (flock(lock_fd, LOCK_EX | (wait_if_busy ? 0 : LOCK_NB)) < 0)
 178 |     {
 179 |       if (errno == EWOULDBLOCK)
 180 | 	die("This box is currently in use by another process");
 181 |       die("Cannot lock %s: %m", lock_name);
 182 |     }
 183 | 
 184 |   int n = read(lock_fd, &lock, sizeof(lock));
 185 |   if (n < 0)
 186 |     die("Cannot read %s: %m", lock_name);
 187 | 
 188 |   if (n > 0)
 189 |     {
 190 |       if (n != sizeof(lock) || lock.magic != LOCK_MAGIC)
 191 | 	die("Lock file %s has incompatible format", lock_name);
 192 |       if (lock.is_initialized && lock.owner_uid != orig_uid && !invoked_by_root)
 193 | 	die("This box belongs to a different user (uid %d)", lock.owner_uid);
 194 |       if (lock.cg_enabled != cg_enable)
 195 | 	die("This box was initialized with an incompatible control group mode");
 196 |     }
 197 | 
 198 |   if (is_init)
 199 |     {
 200 |       lock.magic = LOCK_MAGIC;
 201 |       lock.owner_uid = orig_uid;
 202 |       lock.cg_enabled = cg_enable;
 203 |       lock.is_initialized = 0;
 204 |       lock_write();
 205 |       return true;
 206 |     }
 207 |   else
 208 |     {
 209 |       if (n > 0)
 210 | 	{
 211 | 	  if (!lock.is_initialized)
 212 | 	    die("This box was not initialized properly");
 213 | 	  return true;
 214 | 	}
 215 |       else
 216 | 	{
 217 | 	  // This means that somebody else is just creating the sandbox and we locked it
 218 | 	  // between his creation of the lock file and locking it.
 219 | 	  return false;
 220 | 	}
 221 |     }
 222 | 
 223 |   // The acquired lock will be automatically released on process exit.
 224 | }
 225 | 
 226 | static void
 227 | lock_close(void)
 228 | {
 229 |   if (lock_fd >= 0)
 230 |     {
 231 |       close(lock_fd);
 232 |       lock_fd = -1;
 233 |     }
 234 | }
 235 | 
 236 | static void
 237 | lock_remove(void)
 238 | {
 239 |   // To avoid race conditions, we must never unlink lock files.
 240 |   // We just truncate them to zero length.
 241 |   assert(lock_fd >= 0);
 242 |   if (ftruncate(lock_fd, 0) < 0)
 243 |     die("Cannot truncate lock file: %m");
 244 |   close(lock_fd);
 245 |   lock_fd = -1;
 246 | }
 247 | 
 248 | /*** Messages and exits ***/
 249 | 
 250 | static void
 251 | final_stats(struct rusage *rus)
 252 | {
 253 |   total_ms = get_run_time_ms(rus);
 254 |   wall_ms = get_wall_time_ms();
 255 | 
 256 |   meta_printf("time:%d.%03d\n", total_ms/1000, total_ms%1000);
 257 |   meta_printf("time-wall:%d.%03d\n", wall_ms/1000, wall_ms%1000);
 258 |   meta_printf("max-rss:%ld\n", rus->ru_maxrss);
 259 |   meta_printf("csw-voluntary:%ld\n", rus->ru_nvcsw);
 260 |   meta_printf("csw-forced:%ld\n", rus->ru_nivcsw);
 261 | 
 262 |   cg_stats();
 263 | }
 264 | 
 265 | static void NONRET
 266 | box_exit(int rc)
 267 | {
 268 |   if (proxy_pid > 0)
 269 |     {
 270 |       if (box_pid > 0)
 271 | 	{
 272 | 	  kill(-box_pid, SIGKILL);
 273 | 	  kill(box_pid, SIGKILL);
 274 | 	}
 275 |       if (cg_enable)
 276 | 	{
 277 | 	  /*
 278 | 	   *  In non-CG mode, we must not kill the proxy explicitly.
 279 | 	   *  This is important, because the proxy could exit before the box
 280 | 	   *  completes its exit, causing rusage of the box to be lost.
 281 | 	   *
 282 | 	   *  In CG mode, we must kill the proxy, because it is the init
 283 | 	   *  process of the CG and killing it causes all other processes
 284 | 	   *  inside the CG to be killed. However, we do not care about
 285 | 	   *  rusage.
 286 | 	   */
 287 | 	  kill(-proxy_pid, SIGKILL);
 288 | 	  kill(proxy_pid, SIGKILL);
 289 | 	}
 290 |       meta_printf("killed:1\n");
 291 | 
 292 |       /*
 293 |        *  The rusage will contain time spent by the proxy and its children (i.e., the box).
 294 |        *  (See comments on killing of the proxy above, though.)
 295 |        */
 296 |       struct rusage rus;
 297 |       int p, stat;
 298 |       do
 299 | 	p = wait4(proxy_pid, &stat, 0, &rus);
 300 |       while (p < 0 && errno == EINTR);
 301 |       if (p < 0)
 302 | 	fprintf(stderr, "UGH: Lost track of the process (%m)\n");
 303 |       else
 304 | 	final_stats(&rus);
 305 |     }
 306 | 
 307 |   if (tty_hack && isatty(1))
 308 |     {
 309 |       /*
 310 |        *  If stdout is a tty, make us the foreground process group again.
 311 |        *  We do not need it (we ignore SIGTTOU anyway), but programs executed
 312 |        *  after our exit will.
 313 |        */
 314 |       tcsetpgrp(1, getpgrp());
 315 |     }
 316 | 
 317 |   if (rc < 2 && cleanup_ownership)
 318 |     chowntree("box", orig_uid, orig_gid, special_files);
 319 | 
 320 |   meta_close();
 321 |   exit(rc);
 322 | }
 323 | 
 324 | static void
 325 | flush_line(void)
 326 | {
 327 |   if (partial_line)
 328 |     fputc('\n', stderr);
 329 |   partial_line = 0;
 330 | }
 331 | 
 332 | /* Report an error of the sandbox itself */
 333 | void NONRET __attribute__((format(printf,1,2)))
 334 | die(char *msg, ...)
 335 | {
 336 |   va_list args;
 337 |   va_start(args, msg);
 338 |   char buf[1024];
 339 |   int n = vsnprintf(buf, sizeof(buf), msg, args);
 340 | 
 341 |   // If the child processes are still running, show no mercy.
 342 |   if (box_pid > 0)
 343 |     {
 344 |       kill(-box_pid, SIGKILL);
 345 |       kill(box_pid, SIGKILL);
 346 |     }
 347 |   if (proxy_pid > 0)
 348 |     {
 349 |       kill(-proxy_pid, SIGKILL);
 350 |       kill(proxy_pid, SIGKILL);
 351 |     }
 352 | 
 353 |   if (write_errors_to_fd)
 354 |     {
 355 |       // We are inside the box, have to use error pipe for error reporting.
 356 |       // We hope that the whole error message fits in PIPE_BUF bytes.
 357 |       write(write_errors_to_fd, buf, n);
 358 |       exit(2);
 359 |     }
 360 | 
 361 |   // Otherwise, we in the box keeper process, so we report errors normally
 362 |   flush_line();
 363 |   meta_printf("status:XX\nmessage:%s\n", buf);
 364 |   fputs(buf, stderr);
 365 |   fputc('\n', stderr);
 366 |   box_exit(2);
 367 | }
 368 | 
 369 | /* Report an error of the program inside the sandbox */
 370 | void NONRET __attribute__((format(printf,1,2)))
 371 | err(char *msg, ...)
 372 | {
 373 |   va_list args;
 374 |   va_start(args, msg);
 375 |   flush_line();
 376 |   if (msg[0] && msg[1] && msg[2] == ':' && msg[3] == ' ')
 377 |     {
 378 |       meta_printf("status:%c%c\n", msg[0], msg[1]);
 379 |       msg += 4;
 380 |     }
 381 |   char buf[1024];
 382 |   vsnprintf(buf, sizeof(buf), msg, args);
 383 |   meta_printf("message:%s\n", buf);
 384 |   if (!silent)
 385 |     {
 386 |       fputs(buf, stderr);
 387 |       fputc('\n', stderr);
 388 |     }
 389 |   box_exit(1);
 390 | }
 391 | 
 392 | /* Write a message, but only if in verbose mode */
 393 | void __attribute__((format(printf,1,2)))
 394 | msg(char *msg, ...)
 395 | {
 396 |   va_list args;
 397 |   va_start(args, msg);
 398 |   if (verbose)
 399 |     {
 400 |       int len = strlen(msg);
 401 |       if (len > 0)
 402 |         partial_line = (msg[len-1] != '\n');
 403 |       vfprintf(stderr, msg, args);
 404 |       fflush(stderr);
 405 |     }
 406 |   va_end(args);
 407 | }
 408 | 
 409 | /*** Signal handling in keeper process ***/
 410 | 
 411 | /*
 412 |  *   Signal handling is tricky. We must set up signal handlers before
 413 |  *   we start the child process (and reset them in the child process).
 414 |  *   Otherwise, there is a short time window where a SIGINT can kill
 415 |  *   us and leave the child process running.
 416 |  */
 417 | 
/* What the keeper process does with a given signal, see setup_signals() */
struct signal_rule {
  int signum;
  enum { SIGNAL_IGNORE, SIGNAL_INTERRUPT, SIGNAL_FATAL } action;
};

/*
 *  Signals handled by the keeper:
 *
 *    SIGNAL_IGNORE	the signal is set to SIG_IGN
 *    SIGNAL_INTERRUPT	signal_int() records the signal number in `interrupt'
 *    SIGNAL_FATAL	signal_fatal() calls die()
 *
 *  reset_signals() returns all signals listed here to SIG_DFL.
 */
static const struct signal_rule signal_rules[] = {
  { SIGHUP,	SIGNAL_INTERRUPT },
  { SIGINT,	SIGNAL_INTERRUPT },
  { SIGQUIT,	SIGNAL_INTERRUPT },
  { SIGILL,	SIGNAL_FATAL },
  { SIGABRT,	SIGNAL_FATAL },
  { SIGFPE,	SIGNAL_FATAL },
  { SIGSEGV,	SIGNAL_FATAL },
  { SIGPIPE,	SIGNAL_IGNORE },
  { SIGTERM,	SIGNAL_INTERRUPT },
  { SIGUSR1,	SIGNAL_IGNORE },
  { SIGUSR2,	SIGNAL_IGNORE },
  { SIGBUS,	SIGNAL_FATAL },
  { SIGTTOU,	SIGNAL_IGNORE },	// box_exit() relies on this when it calls tcsetpgrp()
};
 438 | 
 439 | static void
 440 | signal_alarm(int unused UNUSED)
 441 | {
 442 |   /* Time limit checks are synchronous, so we only schedule them there. */
 443 |   timer_tick = 1;
 444 |   msg("[timer]");
 445 | }
 446 | 
 447 | static void
 448 | signal_int(int signum)
 449 | {
 450 |   /* Interrupts (e.g., SIGINT) are synchronous, too. */
 451 |   interrupt = signum;
 452 | }
 453 | 
 454 | static void
 455 | signal_fatal(int signum)
 456 | {
 457 |   /* If we receive SIGSEGV or a similar signal, we try to die gracefully. */
 458 |   die("Sandbox keeper received fatal signal %d", signum);
 459 | }
 460 | 
 461 | static void
 462 | setup_signals(void)
 463 | {
 464 |   struct sigaction sa_int, sa_fatal;
 465 |   bzero(&sa_int, sizeof(sa_int));
 466 |   sa_int.sa_handler = signal_int;
 467 |   bzero(&sa_fatal, sizeof(sa_fatal));
 468 |   sa_fatal.sa_handler = signal_fatal;
 469 | 
 470 |   for (int i=0; i < ARRAY_SIZE(signal_rules); i++)
 471 |     {
 472 |       const struct signal_rule *sr = &signal_rules[i];
 473 |       switch (sr->action)
 474 | 	{
 475 | 	case SIGNAL_IGNORE:
 476 | 	  signal(sr->signum, SIG_IGN);
 477 | 	  break;
 478 | 	case SIGNAL_INTERRUPT:
 479 | 	  sigaction(sr->signum, &sa_int, NULL);
 480 | 	  break;
 481 | 	case SIGNAL_FATAL:
 482 | 	  sigaction(sr->signum, &sa_fatal, NULL);
 483 | 	  break;
 484 | 	default:
 485 | 	  die("Invalid signal rule");
 486 | 	}
 487 |     }
 488 | }
 489 | 
 490 | static void
 491 | reset_signals(void)
 492 | {
 493 |   for (int i=0; i < ARRAY_SIZE(signal_rules); i++)
 494 |     signal(signal_rules[i].signum, SIG_DFL);
 495 | }
 496 | 
 497 | /*** The keeper process ***/
 498 | 
 499 | #define PROC_BUF_SIZE 4096
 500 | static int
 501 | read_proc_file(char *buf, char *name, int *fdp)
 502 | {
 503 |   int c;
 504 | 
 505 |   if (*fdp < 0)
 506 |     {
 507 |       snprintf(buf, PROC_BUF_SIZE, "/proc/%d/%s", (int) box_pid, name);
 508 |       *fdp = open(buf, O_RDONLY);
 509 |       if (*fdp < 0)
 510 | 	return 0;	// This is OK, the process could have finished
 511 |     }
 512 |   lseek(*fdp, 0, SEEK_SET);
 513 |   if ((c = read(*fdp, buf, PROC_BUF_SIZE-1)) < 0)
 514 |     {
 515 |       // Even this could fail if the process disappeared since open()
 516 |       return 0;
 517 |     }
 518 |   if (c >= PROC_BUF_SIZE-1)
 519 |     die("/proc/$pid/%s too long", name);
 520 |   buf[c] = 0;
 521 |   return 1;
 522 | }
 523 | 
 524 | static int
 525 | get_wall_time_ms(void)
 526 | {
 527 |   struct timespec now, wall;
 528 |   clock_gettime(CLOCK_MONOTONIC, &now);
 529 |   timespec_sub(&now, &start_time, &wall);
 530 |   return wall.tv_sec*1000 + wall.tv_nsec/1000000;
 531 | }
 532 | 
 533 | static int
 534 | get_run_time_ms(struct rusage *rus)
 535 | {
 536 |   if (cg_enable)
 537 |     return cg_get_run_time_ms();
 538 | 
 539 |   if (rus)
 540 |     {
 541 |       struct timeval total;
 542 |       timeradd(&rus->ru_utime, &rus->ru_stime, &total);
 543 |       return total.tv_sec*1000 + total.tv_usec/1000;
 544 |     }
 545 | 
 546 |   // It might happen that we do not know the box_pid (see comments in find_box_pid())
 547 |   if (!box_pid)
 548 |     return 0;
 549 | 
 550 |   char buf[PROC_BUF_SIZE], *x;
 551 |   int utime, stime;
 552 |   static int proc_stat_fd = -1;
 553 | 
 554 |   if (!read_proc_file(buf, "stat", &proc_stat_fd))
 555 |     return 0;
 556 |   x = buf;
 557 |   while (*x && *x != ' ')
 558 |     x++;
 559 |   while (*x == ' ')
 560 |     x++;
 561 |   if (*x++ != '(')
 562 |     die("proc stat syntax error 1");
 563 |   while (*x && (*x != ')' || x[1] != ' '))
 564 |     x++;
 565 |   while (*x == ')' || *x == ' ')
 566 |     x++;
 567 |   if (sscanf(x, "%*c %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d %d %d", &utime, &stime) != 2)
 568 |     die("proc stat syntax error 2");
 569 | 
 570 |   return (utime + stime) * 1000 / ticks_per_sec;
 571 | }
 572 | 
 573 | static void
 574 | check_timeout(void)
 575 | {
 576 |   if (wall_timeout)
 577 |     {
 578 |       int wall_ms = get_wall_time_ms();
 579 |       if (wall_ms > wall_timeout)
 580 |         err("TO: Time limit exceeded (wall clock)");
 581 |       if (verbose > 1)
 582 |         fprintf(stderr, "[wall time check: %d msec]\n", wall_ms);
 583 |     }
 584 |   if (timeout)
 585 |     {
 586 |       int ms = get_run_time_ms(NULL);
 587 |       if (verbose > 1)
 588 | 	fprintf(stderr, "[time check: %d msec]\n", ms);
 589 |       if (ms > timeout && ms > extra_timeout)
 590 | 	err("TO: Time limit exceeded");
 591 |     }
 592 | }
 593 | 
 594 | static void
 595 | box_keeper(void)
 596 | {
 597 |   read_errors_from_fd = error_pipes[0];
 598 |   close(error_pipes[1]);
 599 |   close(status_pipes[1]);
 600 | 
 601 |   clock_gettime(CLOCK_MONOTONIC, &start_time);
 602 |   ticks_per_sec = sysconf(_SC_CLK_TCK);
 603 |   if (ticks_per_sec <= 0)
 604 |     die("Invalid ticks_per_sec!");
 605 | 
 606 |   if (timeout || wall_timeout)
 607 |     {
 608 |       struct sigaction sa;
 609 |       bzero(&sa, sizeof(sa));
 610 |       sa.sa_handler = signal_alarm;
 611 |       sigaction(SIGALRM, &sa, NULL);
 612 |       struct itimerval timer = {
 613 | 	.it_interval = { .tv_usec = TIMER_INTERVAL_US },
 614 | 	.it_value = { .tv_usec = TIMER_INTERVAL_US },
 615 |       };
 616 |       setitimer(ITIMER_REAL, &timer, NULL);
 617 |     }
 618 | 
 619 |   for(;;)
 620 |     {
 621 |       struct rusage rus;
 622 |       int stat;
 623 |       pid_t p;
 624 |       if (interrupt)
 625 | 	{
 626 | 	  meta_printf("exitsig:%d\n", interrupt);
 627 | 	  err("SG: Interrupted");
 628 | 	}
 629 |       if (timer_tick)
 630 | 	{
 631 | 	  check_timeout();
 632 | 	  timer_tick = 0;
 633 | 	}
 634 |       p = wait4(proxy_pid, &stat, 0, &rus);
 635 |       if (p < 0)
 636 | 	{
 637 | 	  if (errno == EINTR)
 638 | 	    continue;
 639 | 	  die("wait4: %m");
 640 | 	}
 641 |       if (p != proxy_pid)
 642 | 	die("wait4: unknown pid %d exited!", p);
 643 |       proxy_pid = 0;
 644 | 
 645 |       // Check error pipe if there is an internal error passed from inside the box
 646 |       char interr[1024];
 647 |       int n = read(read_errors_from_fd, interr, sizeof(interr) - 1);
 648 |       if (n > 0)
 649 | 	{
 650 | 	  interr[n] = 0;
 651 | 	  die("%s", interr);
 652 | 	}
 653 | 
 654 |       // Check status pipe if there is an exit status reported by the proxy process
 655 |       n = read(status_pipes[0], &stat, sizeof(stat));
 656 |       if (n != sizeof(stat))
 657 | 	die("Did not receive exit status from proxy");
 658 | 
 659 |       // At this point, the rusage includes time spent by the proxy's children.
 660 |       final_stats(&rus);
 661 |       if (timeout && total_ms > timeout)
 662 | 	err("TO: Time limit exceeded");
 663 |       if (wall_timeout && wall_ms > wall_timeout)
 664 | 	err("TO: Time limit exceeded (wall clock)");
 665 | 
 666 |       if (WIFEXITED(stat))
 667 | 	{
 668 | 	  meta_printf("exitcode:%d\n", WEXITSTATUS(stat));
 669 | 	  if (WEXITSTATUS(stat))
 670 | 	    err("RE: Exited with error status %d", WEXITSTATUS(stat));
 671 | 	  flush_line();
 672 | 	  if (!silent)
 673 | 	    {
 674 | 	      fprintf(stderr, "OK (%d.%03d sec real, %d.%03d sec wall)\n",
 675 | 		total_ms/1000, total_ms%1000,
 676 | 		wall_ms/1000, wall_ms%1000);
 677 | 	    }
 678 | 	  box_exit(0);
 679 | 	}
 680 |       else if (WIFSIGNALED(stat))
 681 | 	{
 682 | 	  meta_printf("exitsig:%d\n", WTERMSIG(stat));
 683 | 	  err("SG: Caught fatal signal %d", WTERMSIG(stat));
 684 | 	}
 685 |       else if (WIFSTOPPED(stat))
 686 | 	{
 687 | 	  meta_printf("exitsig:%d\n", WSTOPSIG(stat));
 688 | 	  err("SG: Stopped by signal %d", WSTOPSIG(stat));
 689 | 	}
 690 |       else
 691 | 	die("wait4: unknown status %x, giving up!", stat);
 692 |     }
 693 | }
 694 | 
 695 | /*** The process running inside the box ***/
 696 | 
 697 | static void
 698 | setup_root(void)
 699 | {
 700 |   if (mkdir("root", 0750) < 0 && errno != EEXIST)
 701 |     die("mkdir('root'): %m");
 702 | 
 703 |   /*
 704 |    * Ensure all mounts are private, not shared. We don't want our mounts
 705 |    * appearing outside of our namespace.
 706 |    * (systemd since version 188 mounts filesystems shared by default).
 707 |    */
 708 |   if (mount(NULL, "/", NULL, MS_REC|MS_PRIVATE, NULL) < 0)
 709 |     die("Cannot privatize mounts: %m");
 710 | 
 711 |   if (mount("none", "root", "tmpfs", 0, "mode=755") < 0)
 712 |     die("Cannot mount root ramdisk: %m");
 713 | 
 714 |   apply_dir_rules(default_dirs);
 715 | 
 716 |   if (chroot("root") < 0)
 717 |     die("Chroot failed: %m");
 718 | 
 719 |   if (chdir("root/box") < 0)
 720 |     die("Cannot change current directory: %m");
 721 | }
 722 | 
 723 | static void
 724 | setup_net(void)
 725 | {
 726 |   if (share_net)
 727 |     return;
 728 | 
 729 |   int fd = socket(PF_INET, SOCK_DGRAM, 0);
 730 |   if (fd < 0)
 731 |     die("Cannot create PF_INET socket: %m");
 732 | 
 733 |   struct ifreq ifr = { .ifr_name = "lo" };
 734 |   if (ioctl(fd, SIOCGIFFLAGS, &ifr) < 0)
 735 |     die("SIOCGIFFLAGS on 'lo' failed: %m");
 736 | 
 737 |   ifr.ifr_flags |= IFF_UP;
 738 |   if (ioctl(fd, SIOCSIFFLAGS, &ifr) < 0)
 739 |     die("SIOCSIFFLAGS on 'lo' failed: %m");
 740 | 
 741 |   close(fd);
 742 | }
 743 | 
 744 | static void
 745 | setup_credentials(void)
 746 | {
 747 |   if (setresgid(box_gid, box_gid, box_gid) < 0)
 748 |     die("setresgid: %m");
 749 |   if (setgroups(0, NULL) < 0)
 750 |     die("setgroups: %m");
 751 |   if (setresuid(box_uid, box_uid, box_uid) < 0)
 752 |     die("setresuid: %m");
 753 |   setpgrp();
 754 |   if (tty_hack && isatty(1))
 755 |     {
 756 |       // If stdout is a tty, make us the foreground process group
 757 |       signal(SIGTTOU, SIG_IGN);
 758 |       tcsetpgrp(1, getpgrp());
 759 |       signal(SIGTTOU, SIG_DFL);
 760 |     }
 761 | }
 762 | 
 763 | static void
 764 | setup_fds(void)
 765 | {
 766 |   if (redir_stdin)
 767 |     {
 768 |       close(0);
 769 |       if (open(redir_stdin, O_RDONLY) != 0)
 770 | 	die("open(\"%s\"): %m", redir_stdin);
 771 |     }
 772 |   if (redir_stdout)
 773 |     {
 774 |       close(1);
 775 |       if (open(redir_stdout, O_WRONLY | O_CREAT | O_TRUNC, 0666) != 1)
 776 | 	die("open(\"%s\"): %m", redir_stdout);
 777 |     }
 778 |   if (redir_stderr)
 779 |     {
 780 |       close(2);
 781 |       if (open(redir_stderr, O_WRONLY | O_CREAT | O_TRUNC, 0666) != 2)
 782 | 	die("open(\"%s\"): %m", redir_stderr);
 783 |     }
 784 |   if (redir_stderr_to_stdout)
 785 |     {
 786 |       if (dup2(1, 2) < 0)
 787 | 	die("Cannot dup stdout to stderr: %m");
 788 |     }
 789 | }
 790 | 
 791 | static void
 792 | setup_rlim(const char *res_name, int res, rlim_t limit)
 793 | {
 794 |   struct rlimit rl = { .rlim_cur = limit, .rlim_max = limit };
 795 |   if (setrlimit(res, &rl) < 0)
 796 |     die("setrlimit(%s, %jd)", res_name, (intmax_t) limit);
 797 | }
 798 | 
 799 | static void
 800 | setup_rlimits(void)
 801 | {
 802 | #define RLIM(res, val) setup_rlim("RLIMIT_" #res, RLIMIT_##res, val)
 803 | 
 804 |   if (memory_limit)
 805 |     RLIM(AS, (rlim_t)memory_limit * 1024);
 806 | 
 807 |   if (fsize_limit)
 808 |     RLIM(FSIZE, (rlim_t)fsize_limit * 1024);
 809 | 
 810 |   if (open_file_limit)
 811 |     RLIM(NOFILE, (rlim_t)open_file_limit);
 812 | 
 813 |   RLIM(STACK, (stack_limit ? (rlim_t)stack_limit * 1024 : RLIM_INFINITY));
 814 |   RLIM(MEMLOCK, 0);
 815 |   RLIM(CORE, (rlim_t)core_limit * 1024);
 816 | 
 817 |   if (max_processes)
 818 |     RLIM(NPROC, max_processes);
 819 | 
 820 | #undef RLIM
 821 | }
 822 | 
 823 | static int
 824 | box_inside(char **args)
 825 | {
 826 |   cg_enter();
 827 |   setup_root();
 828 |   setup_net();
 829 |   setup_rlimits();
 830 |   setup_credentials();
 831 |   setup_fds();
 832 |   char **env = setup_environment();
 833 | 
 834 |   if (set_cwd && chdir(set_cwd))
 835 |     die("chdir: %m");
 836 | 
 837 |   execve(args[0], args, env);
 838 |   fprintf(stderr, "execve(\"%s\"): %m\n", args[0]);
 839 |   exit(127);
 840 | }
 841 | 
 842 | /*** Proxy ***/
 843 | 
 844 | static void
 845 | setup_orig_credentials(void)
 846 | {
 847 |   if (setresgid(orig_gid, orig_gid, orig_gid) < 0)
 848 |     die("setresgid: %m");
 849 |   if (setgroups(0, NULL) < 0)
 850 |     die("setgroups: %m");
 851 |   if (setresuid(orig_uid, orig_uid, orig_uid) < 0)
 852 |     die("setresuid: %m");
 853 | }
 854 | 
 855 | static int
 856 | box_proxy(void *arg)
 857 | {
 858 |   char **args = arg;
 859 | 
 860 |   write_errors_to_fd = error_pipes[1];
 861 |   close(error_pipes[0]);
 862 |   close(status_pipes[0]);
 863 |   meta_close();
 864 |   lock_close();
 865 |   reset_signals();
 866 | 
 867 |   pid_t inside_pid = fork();
 868 |   if (inside_pid < 0)
 869 |     die("Cannot run process, fork failed: %m");
 870 |   else if (!inside_pid)
 871 |     {
 872 |       close(status_pipes[1]);
 873 |       box_inside(args);
 874 |       _exit(42);	// We should never get here
 875 |     }
 876 | 
 877 |   setup_orig_credentials();
 878 |   if (write(status_pipes[1], &inside_pid, sizeof(inside_pid)) != sizeof(inside_pid))
 879 |     die("Proxy write to pipe failed: %m");
 880 | 
 881 |   int stat;
 882 |   pid_t p = waitpid(inside_pid, &stat, 0);
 883 |   if (p < 0)
 884 |     die("Proxy waitpid() failed: %m");
 885 | 
 886 |   if (write(status_pipes[1], &stat, sizeof(stat)) != sizeof(stat))
 887 |     die("Proxy write to pipe failed: %m");
 888 | 
 889 |   _exit(0);
 890 | }
 891 | 
 892 | static void
 893 | box_init(void)
 894 | {
 895 |   if (box_id < 0 || box_id >= cf_num_boxes)
 896 |     die("Sandbox ID out of range (allowed: 0-%d)", cf_num_boxes-1);
 897 |   box_uid = cf_first_uid + box_id;
 898 |   box_gid = cf_first_gid + box_id;
 899 | 
 900 |   snprintf(box_dir, sizeof(box_dir), "%s/%d", cf_box_root, box_id);
 901 | }
 902 | 
 903 | /*** Commands ***/
 904 | 
 905 | static const char *
 906 | self_name(void)
 907 | {
 908 |   return cg_enable ? "isolate --cg" : "isolate";
 909 | }
 910 | 
 911 | static void
 912 | get_credentials(void)
 913 | {
 914 |   if (geteuid())
 915 |     die("Must be started as root");
 916 |   if (getegid() && setegid(0) < 0)
 917 |     die("Cannot switch to root group: %m");
 918 | 
 919 |   orig_uid = getuid();
 920 |   orig_gid = getgid();
 921 |   invoked_by_root = !orig_uid;
 922 | 
 923 |   if (as_uid >= 0 || as_gid >= 0)
 924 |     {
 925 |       if (!invoked_by_root)
 926 | 	die("You must be root to use --as-uid or --as-gid");
 927 |       if (as_uid < 0 || as_gid < 0)
 928 | 	die("--as-uid and --as-gid must be used either both or none");
 929 |       orig_uid = as_uid;
 930 |       orig_gid = as_gid;
 931 |     }
 932 | }
 933 | 
 934 | static void
 935 | do_cleanup(void)
 936 | {
 937 |   if (dir_exists(box_dir))
 938 |     {
 939 |       msg("Removing box directory\n");
 940 |       rmtree(box_dir);
 941 |     }
 942 |   cg_remove();
 943 | }
 944 | 
 945 | static void
 946 | init(void)
 947 | {
 948 |   if (cf_restricted_init && !invoked_by_root)
 949 |     die("New sandboxes can be created only by root");
 950 | 
 951 |   lock_box(true);
 952 | 
 953 |   do_cleanup();
 954 | 
 955 |   msg("Preparing sandbox\n");
 956 |   make_dir(box_dir);
 957 |   if (chdir(box_dir) < 0)
 958 |     die("chdir(%s): %m", box_dir);
 959 |   if (mkdir("box", 0700) < 0)
 960 |     die("Cannot create box: %m");
 961 |   if (chown("box", orig_uid, orig_gid) < 0)
 962 |     die("Cannot chown box: %m");
 963 | 
 964 |   cg_create();
 965 |   set_quota();
 966 | 
 967 |   lock.is_initialized = 1;
 968 |   lock_write();
 969 | 
 970 |   puts(box_dir);
 971 | }
 972 | 
 973 | static void
 974 | cleanup(void)
 975 | {
 976 |   if (!lock_box(false))
 977 |     msg("Nothing to do -- box did not exist\n");
 978 |   else
 979 |     {
 980 |       msg("Deleting sandbox\n");
 981 |       do_cleanup();
 982 |       lock_remove();
 983 |     }
 984 | }
 985 | 
 986 | static void
 987 | setup_pipe(int *fds, int nonblocking)
 988 | {
 989 |   if (pipe(fds) < 0)
 990 |     die("pipe: %m");
 991 |   for (int i=0; i<2; i++)
 992 |     if (fcntl(fds[i], F_SETFD, fcntl(fds[i], F_GETFD) | FD_CLOEXEC) < 0 ||
 993 |         nonblocking && fcntl(fds[i], F_SETFL, fcntl(fds[i], F_GETFL) | O_NONBLOCK) < 0)
 994 |       die("fcntl on pipe: %m");
 995 | }
 996 | 
 997 | static void
 998 | find_box_pid(void)
 999 | {
1000 |   /*
1001 |    *  The box keeper process wants to poll status of the inside process,
1002 |    *  so it needs to know the box_pid. However, it is not easy to obtain:
1003 |    *  we got the PID from the proxy, but it is local to the PID namespace.
1004 |    *  Instead, we ask /proc to enumerate the children of the proxy.
1005 |    *
1006 |    *  CAVEAT: The timing is tricky. We know that the inside process was
1007 |    *  already started (passing the PID from the proxy to us guarantees it),
1008 |    *  but it might already have exited and be reaped by the proxy. Therefore
1009 |    *  it is correct if we fail to find anything.
1010 |    */
1011 | 
1012 |   char namebuf[256];
1013 |   snprintf(namebuf, sizeof(namebuf), "/proc/%d/task/%d/children", (int) proxy_pid, (int) proxy_pid);
1014 |   FILE *f = fopen(namebuf, "r");
1015 |   if (!f)
1016 |     return;
1017 | 
1018 |   int child;
1019 |   if (fscanf(f, "%d", &child) != 1)
1020 |     {
1021 |       fclose(f);
1022 |       return;
1023 |     }
1024 |   box_pid = child;
1025 | 
1026 |   if (fscanf(f, "%d", &child) == 1)
1027 |     die("Error parsing %s: unexpected children found", namebuf);
1028 | 
1029 |   fclose(f);
1030 | }
1031 | 
1032 | static void
1033 | run(char **argv)
1034 | {
1035 |   if (!lock_box(false))
1036 |     die("Box not found, did you run `%s --init'?", self_name());
1037 | 
1038 |   if (chdir(box_dir) < 0)
1039 |     die("chdir(%s): %m", box_dir);
1040 | 
1041 |   if (!inherit_fds)
1042 |     {
1043 |       keep_fd(lock_fd);
1044 |       close_all_fds();
1045 |     }
1046 | 
1047 |   chowntree("box", box_uid, box_gid, false);
1048 |   cleanup_ownership = 1;
1049 | 
1050 |   setup_pipe(error_pipes, 1);
1051 |   setup_pipe(status_pipes, 0);
1052 |   setup_signals();
1053 |   cg_setup();
1054 | 
1055 |   proxy_pid = clone(
1056 |     box_proxy,			// Function to execute as the body of the new process
1057 |     (void*)((uintptr_t)argv & ~(uintptr_t)15),	// Pass our stack, aligned to 16-bytes
1058 |     SIGCHLD | CLONE_NEWIPC | (share_net ? 0 : CLONE_NEWNET) | CLONE_NEWNS | CLONE_NEWPID,
1059 |     argv);			// Pass the arguments
1060 |   if (proxy_pid < 0)
1061 |     die("Cannot run proxy, clone failed: %m");
1062 |   if (!proxy_pid)
1063 |     die("Cannot run proxy, clone returned 0");
1064 | 
1065 |   pid_t box_pid_inside_ns;
1066 |   int n = read(status_pipes[0], &box_pid_inside_ns, sizeof(box_pid_inside_ns));
1067 |   if (n != sizeof(box_pid_inside_ns))
1068 |     die("Proxy failed before it passed box_pid: %m");
1069 |   find_box_pid();
1070 |   msg("Started proxy_pid=%d box_pid=%d box_pid_inside_ns=%d\n", (int) proxy_pid, (int) box_pid, (int) box_pid_inside_ns);
1071 | 
1072 |   box_keeper();
1073 | }
1074 | 
1075 | static void
1076 | show_version(void)
1077 | {
1078 |   printf("The process isolator " ISOLATE_VERSION "\n");
1079 |   printf("(c) 2012--" ISOLATE_YEAR " Martin Mares and Bernard Blackham\n");
1080 | #if defined(BUILD_DATE) && defined(BUILD_COMMIT)
1081 |   printf("Built on " BUILD_DATE " from Git commit " BUILD_COMMIT "\n");
1082 | #endif
1083 | }
1084 | 
1085 | /*** Options ***/
1086 | 
/*
 * Print an optional error message to stderr, followed by the usage help
 * on stdout, and exit with status 2. Never returns.
 *
 *   msg  printf-style format of the error message, or NULL for help only
 */
static void __attribute__((format(printf,1,2)))
usage(const char *msg, ...)
{
  if (msg != NULL)
    {
      va_list args;
      va_start(args, msg);
      vfprintf(stderr, msg, args);
      va_end(args);
    }
  // Continuation lines of the string must start in the first column,
  // otherwise the indentation would become a part of the help text
  printf("\
Usage: isolate [<options>] <command>\n\
\n\
Options:\n\
    --as-uid=<uid>\tPerform action on behalf of a given user (requires root)\n\
    --as-gid=<gid>\tPerform action on behalf of a given group (requires root)\n\
-b, --box-id=<id>\tWhen multiple sandboxes are used in parallel, each must get a unique ID\n\
    --cg\t\tEnable use of control groups\n\
    --cg-mem=<size>\tLimit memory usage of the control group to <size> KB\n\
-c, --chdir=<dir>\tChange directory to <dir> before executing the program\n\
    --core=<size>\tLimit core files to <size> KB (default: 0)\n\
-d, --dir=<dir>\t\tMake a directory <dir> visible inside the sandbox\n\
    --dir=<in>=<out>\tMake a directory <out> outside visible as <in> inside\n\
    --dir=<in>=\t\tDelete a previously defined directory rule (even a default one)\n\
    --dir=...:<opt>\tSpecify options for a rule:\n\
\t\t\t\tdev\tAllow access to block/char devices\n\
\t\t\t\tfs\tMount a filesystem (e.g., --dir=/proc:proc:fs)\n\
\t\t\t\tmaybe\tSkip the rule if <out> does not exist\n\
\t\t\t\tnoexec\tDo not allow execution of binaries\n\
\t\t\t\tnorec\tDo not bind the directory recursively\n\
\t\t\t\trw\tAllow read-write access\n\
\t\t\t\ttmp\tCreate as a temporary directory (implies rw)\n\
-D, --no-default-dirs\tDo not add default directory rules\n\
-f, --fsize=<size>\tMax size (in KB) of files that can be created\n\
-E, --env=<var>\t\tInherit the environment variable <var> from the parent process\n\
-E, --env=<var>=<val>\tSet the environment variable <var> to <val>; unset it if <val> is empty\n\
-x, --extra-time=<time>\tSet extra timeout, before which a timing-out program is not yet killed,\n\
\t\t\tso that its real execution time is reported (seconds, fractions allowed)\n\
-e, --full-env\t\tInherit full environment of the parent process\n\
    --inherit-fds\tInherit all file descriptors of the parent process\n\
-m, --mem=<size>\tLimit address space to <size> KB\n\
-M, --meta=<file>\tOutput process information to <file> (name:value)\n\
-n, --open-files=<max>\tLimit number of open files to <max> (default: 64, 0=unlimited)\n\
-q, --quota=<blk>,<ino>\tSet disk quota to <blk> blocks and <ino> inodes\n\
    --share-net\t\tShare network namespace with the parent process\n\
-s, --silent\t\tDo not print status messages except for fatal errors\n\
    --special-files\tKeep non-regular files (symlinks etc.) produced inside sandbox\n\
-k, --stack=<size>\tLimit stack size to <size> KB (default: 0=unlimited)\n\
-r, --stderr=<file>\tRedirect stderr to <file>\n\
    --stderr-to-stdout\tRedirect stderr to stdout\n\
-i, --stdin=<file>\tRedirect stdin from <file>\n\
-o, --stdout=<file>\tRedirect stdout to <file>\n\
-p, --processes[=<max>]\tEnable multiple processes (at most <max> of them); needs --cg\n\
-t, --time=<time>\tSet run time limit (seconds, fractions allowed)\n\
    --tty-hack\t\tAllow interactive programs in the sandbox (see man for caveats)\n\
-v, --verbose\t\tBe verbose (use multiple times for even more verbosity)\n\
    --wait\t\tIf the sandbox is currently busy, wait instead of refusing to run\n\
-w, --wall-time=<time>\tSet wall clock time limit (seconds, fractions allowed)\n\
\n\
Commands:\n\
    --init\t\tInitialize sandbox (and its control group when --cg is used)\n\
    --run -- <cmd> ...\tRun given command within sandbox\n\
    --cleanup\t\tClean up sandbox\n\
    --print-cg-root\tPrint the root of cgroup hierarchy\n\
    --version\t\tDisplay program version and configuration\n\
");
  exit(2);
}
1155 | 
// Codes of the options which have only a long form, and of the commands.
// They start at 256, so they never collide with the character of a short option.
enum opt_code {
  // Commands (stored in `mode' by main())
  OPT_INIT = 256,
  OPT_RUN,
  OPT_CLEANUP,
  OPT_VERSION,
  // Options without a short form
  OPT_CG,
  OPT_CG_MEM,
  OPT_SHARE_NET,
  OPT_INHERIT_FDS,
  OPT_STDERR_TO_STDOUT,
  OPT_TTY_HACK,
  OPT_CORE,
  OPT_SPECIAL_FILES,
  OPT_WAIT,
  OPT_AS_UID,
  OPT_AS_GID,
  // One more command
  OPT_PRINT_CG_ROOT,
};
1174 | 
// Short options in getopt() syntax: a colon after a letter marks a required
// argument, two colons mark an optional one (used by -p only)
static const char short_opts[] = "b:c:d:DeE:f:i:k:m:M:n:o:p::q:r:st:vw:x:";
1176 | 
1177 | static const struct option long_opts[] = {
1178 |   { "as-uid",		1, NULL, OPT_AS_UID },
1179 |   { "as-gid",		1, NULL, OPT_AS_GID },
1180 |   { "box-id",		1, NULL, 'b' },
1181 |   { "chdir",		1, NULL, 'c' },
1182 |   { "cg",		0, NULL, OPT_CG },
1183 |   { "cg-mem",		1, NULL, OPT_CG_MEM },
1184 |   { "cleanup",		0, NULL, OPT_CLEANUP },
1185 |   { "core",		1, NULL, OPT_CORE },
1186 |   { "dir",		1, NULL, 'd' },
1187 |   { "no-default-dirs",  0, NULL, 'D' },
1188 |   { "fsize",		1, NULL, 'f' },
1189 |   { "env",		1, NULL, 'E' },
1190 |   { "extra-time",	1, NULL, 'x' },
1191 |   { "full-env",		0, NULL, 'e' },
1192 |   { "inherit-fds",	0, NULL, OPT_INHERIT_FDS },
1193 |   { "init",		0, NULL, OPT_INIT },
1194 |   { "mem",		1, NULL, 'm' },
1195 |   { "meta",		1, NULL, 'M' },
1196 |   { "processes",	2, NULL, 'p' },
1197 |   { "quota",		1, NULL, 'q' },
1198 |   { "run",		0, NULL, OPT_RUN },
1199 |   { "share-net",	0, NULL, OPT_SHARE_NET },
1200 |   { "silent",		0, NULL, 's' },
1201 |   { "stack",		1, NULL, 'k' },
1202 |   { "open-files",	1, NULL, 'n' },
1203 |   { "print-cg-root",	0, NULL, OPT_PRINT_CG_ROOT },
1204 |   { "special-files",	0, NULL, OPT_SPECIAL_FILES },
1205 |   { "stderr",		1, NULL, 'r' },
1206 |   { "stderr-to-stdout",	0, NULL, OPT_STDERR_TO_STDOUT },
1207 |   { "stdin",		1, NULL, 'i' },
1208 |   { "stdout",		1, NULL, 'o' },
1209 |   { "time",		1, NULL, 't' },
1210 |   { "tty-hack",		0, NULL, OPT_TTY_HACK },
1211 |   { "verbose",		0, NULL, 'v' },
1212 |   { "version",		0, NULL, OPT_VERSION },
1213 |   { "wait",		0, NULL, OPT_WAIT },
1214 |   { "wall-time",	1, NULL, 'w' },
1215 |   { NULL,		0, NULL, 0 }
1216 | };
1217 | 
/*
 * Parse the argument of an option as a non-negative decimal integer.
 *
 *   val  the text of the argument
 *
 * Returns the parsed number, which is guaranteed to fit in a signed int.
 * On malformed or out-of-range input, prints an error with the usage help
 * and exits (see usage()).
 */
static unsigned int
opt_uint(char *val)
{
  // strtoul() accepts a leading minus sign and negates the result in
  // unsigned arithmetic, so e.g. "-18446744073709551615" would be read
  // as 1 on systems with 64-bit long. Refuse negative numbers up front.
  if (val[strspn(val, " \t\n\v\f\r")] == '-')
    usage("Invalid numeric parameter: %s\n", val);

  // This accepts unsigned values which also fit within a signed int
  char *end;
  errno = 0;
  unsigned long int x = strtoul(val, &end, 10);
  if (errno || end == val || *end)
    usage("Invalid numeric parameter: %s\n", val);
  if (x > INT_MAX)
    usage("Numeric parameter out of range: %s\n", val);
  return x;
}
1231 | 
1232 | int
1233 | main(int argc, char **argv)
1234 | {
1235 |   int c;
1236 |   int require_cg = 0;
1237 |   char *sep;
1238 |   enum opt_code mode = 0;
1239 | 
1240 |   init_dir_rules();
1241 | 
1242 |   while ((c = getopt_long(argc, argv, short_opts, long_opts, NULL)) >= 0)
1243 |     switch (c)
1244 |       {
1245 |       case 'b':
1246 | 	box_id = opt_uint(optarg);
1247 | 	break;
1248 |       case 'c':
1249 | 	set_cwd = optarg;
1250 | 	break;
1251 |       case OPT_CG:
1252 | 	cg_enable = 1;
1253 | 	break;
1254 |       case 'd':
1255 | 	if (!set_dir_action(optarg))
1256 | 	  usage("Invalid directory rule specified: %s\n", optarg);
1257 | 	break;
1258 |       case 'D':
1259 |         default_dirs = 0;
1260 |         break;
1261 |       case 'e':
1262 | 	pass_environ = 1;
1263 | 	break;
1264 |       case 'E':
1265 | 	if (!set_env_action(optarg))
1266 | 	  usage("Invalid environment specified: %s\n", optarg);
1267 | 	break;
1268 |       case 'f':
1269 |         fsize_limit = opt_uint(optarg);
1270 |         break;
1271 |       case 'k':
1272 | 	stack_limit = opt_uint(optarg);
1273 | 	break;
1274 |       case 'n':
1275 | 	open_file_limit = opt_uint(optarg);
1276 | 	break;
1277 |       case 'i':
1278 | 	redir_stdin = optarg;
1279 | 	break;
1280 |       case 'm':
1281 | 	memory_limit = opt_uint(optarg);
1282 | 	break;
1283 |       case 'M':
1284 | 	meta_open(optarg);
1285 | 	break;
1286 |       case 'o':
1287 | 	redir_stdout = optarg;
1288 | 	break;
1289 |       case 'p':
1290 | 	if (optarg)
1291 | 	  max_processes = opt_uint(optarg);
1292 | 	else
1293 | 	  max_processes = 0;
1294 | 	break;
1295 |       case 'q':
1296 | 	optarg = xstrdup(optarg);
1297 | 	sep = strchr(optarg, ',');
1298 | 	if (!sep)
1299 | 	  usage("Invalid quota specified: %s\n", optarg);
1300 | 	*sep = 0;
1301 | 	block_quota = opt_uint(optarg);
1302 | 	inode_quota = opt_uint(sep+1);
1303 | 	break;
1304 |       case 'r':
1305 | 	redir_stderr = optarg;
1306 | 	redir_stderr_to_stdout = 0;
1307 | 	break;
1308 |       case 's':
1309 | 	silent++;
1310 | 	break;
1311 |       case 't':
1312 | 	timeout = 1000*atof(optarg);
1313 | 	break;
1314 |       case 'v':
1315 | 	verbose++;
1316 | 	break;
1317 |       case 'w':
1318 | 	wall_timeout = 1000*atof(optarg);
1319 | 	break;
1320 |       case 'x':
1321 | 	extra_timeout = 1000*atof(optarg);
1322 | 	break;
1323 |       case OPT_INIT:
1324 |       case OPT_RUN:
1325 |       case OPT_CLEANUP:
1326 |       case OPT_VERSION:
1327 |       case OPT_PRINT_CG_ROOT:
1328 | 	if (!mode || (int) mode == c)
1329 | 	  mode = c;
1330 | 	else
1331 | 	  usage("Only one command is allowed.\n");
1332 | 	break;
1333 |       case OPT_CG_MEM:
1334 | 	cg_memory_limit = opt_uint(optarg);
1335 | 	require_cg = 1;
1336 | 	break;
1337 |       case OPT_SHARE_NET:
1338 | 	share_net = 1;
1339 | 	break;
1340 |       case OPT_INHERIT_FDS:
1341 | 	inherit_fds = 1;
1342 | 	break;
1343 |       case OPT_STDERR_TO_STDOUT:
1344 | 	redir_stderr = NULL;
1345 | 	redir_stderr_to_stdout = 1;
1346 | 	break;
1347 |       case OPT_TTY_HACK:
1348 | 	tty_hack = 1;
1349 | 	break;
1350 |       case OPT_CORE:
1351 | 	core_limit = opt_uint(optarg);
1352 | 	break;
1353 |       case OPT_SPECIAL_FILES:
1354 | 	special_files = true;
1355 | 	break;
1356 |       case OPT_WAIT:
1357 | 	wait_if_busy = true;
1358 | 	break;
1359 |       case OPT_AS_UID:
1360 | 	as_uid = opt_uint(optarg);
1361 | 	break;
1362 |       case OPT_AS_GID:
1363 | 	as_gid = opt_uint(optarg);
1364 | 	break;
1365 |       default:
1366 | 	usage(NULL);
1367 |       }
1368 | 
1369 |   if (!mode)
1370 |     usage("Please specify an isolate command (e.g. --init, --run).\n");
1371 |   if (mode == OPT_VERSION)
1372 |     {
1373 |       show_version();
1374 |       return 0;
1375 |     }
1376 | 
1377 |   if (mode == OPT_PRINT_CG_ROOT)
1378 |     cg_enable = 1;
1379 | 
1380 |   if (require_cg && !cg_enable)
1381 |     usage("Options related to control groups require --cg to be set.\n");
1382 | 
1383 |   get_credentials();
1384 |   umask(022);
1385 |   cf_parse();
1386 |   box_init();
1387 |   cg_init();
1388 | 
1389 |   switch (mode)
1390 |     {
1391 |     case OPT_INIT:
1392 |       if (optind < argc)
1393 | 	usage("--init mode takes no parameters\n");
1394 |       init();
1395 |       break;
1396 |     case OPT_RUN:
1397 |       if (optind >= argc)
1398 | 	usage("--run mode requires a command to run\n");
1399 |       run(argv+optind);
1400 |       break;
1401 |     case OPT_CLEANUP:
1402 |       if (optind < argc)
1403 | 	usage("--cleanup mode takes no parameters\n");
1404 |       cleanup();
1405 |       break;
1406 |     case OPT_PRINT_CG_ROOT:
1407 |       printf("%s\n", cf_cg_root);
1408 |       break;
1409 |     default:
1410 |       die("Internal error: mode mismatch");
1411 |     }
1412 |   exit(0);
1413 | }
1414 | 


--------------------------------------------------------------------------------