├── systemd ├── isolate.slice └── isolate.service.in ├── debian ├── build │ ├── do-test │ ├── container-test │ │ ├── jammy │ │ ├── noble │ │ ├── bookworm │ │ └── trixie │ ├── container-build │ │ ├── jammy │ │ ├── noble │ │ ├── trixie │ │ └── bookworm │ ├── do-build │ └── run ├── isolate.lintian-overrides.trixie ├── isolate.lintian-overrides ├── copyright ├── isolate.postrm ├── isolate.postinst ├── rules ├── rules.trixie ├── control └── changelog ├── .gitignore ├── TODO ├── .travis.yml ├── isolate-cg-keeper.8.txt ├── LICENSE ├── default.cf.in ├── isolate-check-environment.8.txt ├── README.md ├── NEWS ├── isolate.h ├── isolate-cg-keeper.c ├── config.c ├── Makefile ├── cg.c ├── util.c ├── isolate-check-environment ├── rules.c ├── isolate.1.txt └── isolate.c /systemd/isolate.slice: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Slice for Isolate's sandboxes 3 | -------------------------------------------------------------------------------- /debian/build/do-test: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | apt install -y ./isolate_*.deb 4 | -------------------------------------------------------------------------------- /debian/isolate.lintian-overrides.trixie: -------------------------------------------------------------------------------- 1 | elevated-privileges 2 | unknown-section 3 | -------------------------------------------------------------------------------- /debian/build/container-test/jammy: -------------------------------------------------------------------------------- 1 | FROM ubuntu:jammy 2 | 3 | RUN apt update && apt upgrade -y 4 | -------------------------------------------------------------------------------- /debian/build/container-test/noble: -------------------------------------------------------------------------------- 1 | FROM ubuntu:noble 2 | 3 | RUN apt update && apt upgrade -y 4 | 
-------------------------------------------------------------------------------- /debian/build/container-test/bookworm: -------------------------------------------------------------------------------- 1 | FROM debian:bookworm 2 | 3 | RUN apt update && apt upgrade -y 4 | -------------------------------------------------------------------------------- /debian/build/container-test/trixie: -------------------------------------------------------------------------------- 1 | FROM debian:trixie 2 | 3 | RUN apt update && apt upgrade -y 4 | -------------------------------------------------------------------------------- /debian/isolate.lintian-overrides: -------------------------------------------------------------------------------- 1 | elevated-privileges 2 | unknown-section 3 | package-supports-alternative-init-but-no-init.d-script 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | debian/build/build-tmp 2 | default.cf 3 | docbook-xsl.css 4 | isolate 5 | *.[18] 6 | *.[18].html 7 | isolate-cg-keeper 8 | systemd/isolate.service 9 | *.o 10 | -------------------------------------------------------------------------------- /debian/build/container-build/jammy: -------------------------------------------------------------------------------- 1 | FROM ubuntu:jammy 2 | 3 | RUN apt update && apt upgrade -y 4 | RUN apt install -y --no-install-recommends build-essential debhelper pkg-config libcap-dev libsystemd-dev asciidoc xmlto lintian 5 | -------------------------------------------------------------------------------- /debian/build/container-build/noble: -------------------------------------------------------------------------------- 1 | FROM ubuntu:noble 2 | 3 | RUN apt update && apt upgrade -y 4 | RUN apt install -y --no-install-recommends build-essential debhelper pkg-config libcap-dev libsystemd-dev asciidoc xmlto lintian 5 | 
-------------------------------------------------------------------------------- /debian/build/container-build/trixie: -------------------------------------------------------------------------------- 1 | FROM debian:trixie 2 | 3 | RUN apt update && apt upgrade -y 4 | RUN apt install -y --no-install-recommends build-essential debhelper pkg-config libcap-dev libsystemd-dev asciidoc xmlto lintian 5 | -------------------------------------------------------------------------------- /debian/build/container-build/bookworm: -------------------------------------------------------------------------------- 1 | FROM debian:bookworm 2 | 3 | RUN apt update && apt upgrade -y 4 | RUN apt install -y --no-install-recommends build-essential debhelper pkg-config libcap-dev libsystemd-dev asciidoc xmlto lintian 5 | -------------------------------------------------------------------------------- /TODO: -------------------------------------------------------------------------------- 1 | - Make --inherit-fds accept a list of fd's to inherit 2 | 3 | - use /etc/subuid for the UID range 4 | - but still allow to configure maximum number of sandboxes 5 | less than the size of the range, so that CPU / node 6 | restrictions cannot be bypassed 7 | -------------------------------------------------------------------------------- /systemd/isolate.service.in: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=A trivial daemon to keep Isolate's control group hierarchy 3 | 4 | [Service] 5 | Type=notify 6 | ExecStart=@SBINDIR@/isolate-cg-keeper 7 | Slice=isolate.slice 8 | Delegate=true 9 | 10 | [Install] 11 | WantedBy=multi-user.target 12 | -------------------------------------------------------------------------------- /debian/copyright: -------------------------------------------------------------------------------- 1 | Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ 2 | Source: https://github.com/ioi/isolate 3 | 
Upstream-Name: Isolate 4 | Upstream-Contact: Martin Mareš 5 | 6 | Files: 7 | * 8 | Copyright: 2012-2024 Martin Mareš and Bernard Blackham 9 | License: GPL-2+ 10 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: c 2 | 3 | compiler: gcc 4 | 5 | addons: 6 | apt: 7 | packages: 8 | - asciidoc 9 | - libcap-dev 10 | - libxml2-utils 11 | - xsltproc 12 | - docbook-xml 13 | - docbook-xsl 14 | 15 | script: 16 | - make DESTDIR=/tmp/isolate 17 | - make DESTDIR=/tmp/isolate install 18 | -------------------------------------------------------------------------------- /debian/isolate.postrm: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | case "$1" in 5 | remove) 6 | if getent group isolate >/dev/null ; then 7 | echo "Removing group isolate" 8 | delgroup --quiet --only-if-empty isolate 9 | fi 10 | if dpkg-statoverride --list /usr/bin/isolate >/dev/null 2>&1 ; then 11 | dpkg-statoverride --remove /usr/bin/isolate 12 | fi 13 | ;; 14 | esac 15 | 16 | #DEBHELPER# 17 | 18 | exit 0 19 | -------------------------------------------------------------------------------- /debian/isolate.postinst: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | case "$1" in 5 | configure) 6 | if ! 
getent group isolate >/dev/null ; then 7 | echo "Adding new group isolate" 8 | addgroup --quiet --system isolate 9 | fi 10 | dpkg-statoverride --list /usr/bin/isolate >/dev/null 2>&1 || 11 | dpkg-statoverride --update --add root isolate 4754 /usr/bin/isolate 12 | ;; 13 | esac 14 | 15 | #DEBHELPER# 16 | 17 | exit 0 18 | -------------------------------------------------------------------------------- /debian/rules: -------------------------------------------------------------------------------- 1 | #!/usr/bin/make -f 2 | 3 | # export DH_VERBOSE = 1 4 | 5 | %: 6 | dh $@ 7 | 8 | override_dh_auto_build: 9 | make all PREFIX=/usr VARPREFIX=/var CONFIGDIR=/etc CFLAGS_EXTRA=-g 10 | 11 | override_dh_auto_install: 12 | make install install-doc PREFIX=/usr VARPREFIX=/var CONFIGDIR=/etc LIBDIR=/lib DESTDIR=debian/isolate 13 | 14 | override_dh_fixperms: 15 | dh_fixperms --exclude usr/bin/isolate 16 | 17 | override_dh_installsystemd: 18 | dh_installsystemd isolate.service 19 | -------------------------------------------------------------------------------- /debian/rules.trixie: -------------------------------------------------------------------------------- 1 | #!/usr/bin/make -f 2 | 3 | # export DH_VERBOSE = 1 4 | 5 | %: 6 | dh $@ 7 | 8 | override_dh_auto_build: 9 | make all PREFIX=/usr VARPREFIX=/var CONFIGDIR=/etc CFLAGS_EXTRA=-g 10 | 11 | override_dh_auto_install: 12 | make install install-doc PREFIX=/usr VARPREFIX=/var CONFIGDIR=/etc LIBDIR=/usr/lib DESTDIR=debian/isolate 13 | 14 | override_dh_fixperms: 15 | dh_fixperms --exclude usr/bin/isolate 16 | 17 | override_dh_installsystemd: 18 | dh_installsystemd isolate.service 19 | -------------------------------------------------------------------------------- /debian/build/do-build: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | cd isolate 4 | . 
/etc/os-release 5 | for fixup in $(find debian -name "*.$VERSION_CODENAME") ; do 6 | echo "Applying fixup: $fixup" 7 | mv $fixup ${fixup%.$VERSION_CODENAME} 8 | done 9 | if [ "$ID" = debian ] ; then 10 | # We currently run lintian only on Debian 11 | dpkg-buildpackage -b -uc -tc --check-command=lintian --check-option=-i --check-option=--fail-on=warning,error --check-option=--allow-root 12 | else 13 | dpkg-buildpackage -b -uc -tc 14 | fi 15 | -------------------------------------------------------------------------------- /isolate-cg-keeper.8.txt: -------------------------------------------------------------------------------- 1 | ISOLATE-CG-KEEPER(8) 2 | ==================== 3 | 4 | NAME 5 | ---- 6 | isolate-cg-keeper - A helper daemon for keeping cgroups alive 7 | 8 | SYNOPSIS 9 | -------- 10 | *isolate-cg-keeper* 11 | 12 | DESCRIPTION 13 | ----------- 14 | To use *isolate*(1) on a system with *systemd*(1), it is necessary to have a subtree of the 15 | control group tree delegated to Isolate. 16 | 17 | This is accomplished by running a service (`isolate.service`) that keeps alive a slice (`isolate.slice`) 18 | using the *isolate-cg-keeper* daemon. 19 | 20 | SEE ALSO 21 | -------- 22 | *isolate*(1) 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Isolate is free software: you can redistribute it and/or modify 2 | it under the terms of the GNU General Public License as published by 3 | the Free Software Foundation, either version 2 of the License, or 4 | (at your option) any later version. 5 | 6 | This program is distributed in the hope that it will be useful, 7 | but WITHOUT ANY WARRANTY; without even the implied warranty of 8 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 9 | GNU General Public License for more details. 
10 | 11 | If you have less than 10 copies of the GPL on your system :-), 12 | you can find it at http://www.gnu.org/licenses/. 13 | -------------------------------------------------------------------------------- /debian/control: -------------------------------------------------------------------------------- 1 | Source: isolate 2 | Section: ucw 3 | Priority: optional 4 | Maintainer: Martin Mareš 5 | Standards-Version: 4.6.0.1 6 | Build-Depends: debhelper (>= 13.0), debhelper-compat (= 13), 7 | pkg-config, libcap-dev, libsystemd-dev, asciidoc 8 | 9 | Package: isolate 10 | Architecture: any 11 | Description: Sandbox for programming contests 12 | Isolate is a sandbox built to safely run untrusted executables, like 13 | programs submitted by competitors in a programming contest. Isolate 14 | gives them a limited-access environment, preventing them from affecting 15 | the host system. It takes advantage of features specific to the Linux 16 | kernel, like namespaces and control groups. 17 | Depends: adduser, ${shlibs:Depends} 18 | -------------------------------------------------------------------------------- /default.cf.in: -------------------------------------------------------------------------------- 1 | # This is a configuration file for Isolate 2 | 3 | # All sandboxes are created under this directory. 4 | # To avoid symlink attacks, this directory and all its ancestors 5 | # must be writeable only to root. 6 | box_root = @BOXDIR@ 7 | 8 | # Directory where lock files are created. 9 | lock_root = /run/isolate/locks 10 | 11 | # Control group under which we place our subgroups 12 | # Either an explicit path to a subdirectory in cgroupfs, or "auto:file" to read 13 | # the path from "file", where it is put by isolate-cg-keeper. 
14 | # cg_root = /sys/fs/cgroup/isolate.slice/isolate.service 15 | cg_root = auto:/run/isolate/cgroup 16 | 17 | # Block of UIDs and GIDs reserved for sandboxes 18 | first_uid = 60000 19 | first_gid = 60000 20 | num_boxes = 1000 21 | 22 | # Only root can create new sandboxes (default: 0=everybody can) 23 | #restricted_init = 1 24 | 25 | # Per-box settings of the set of allowed CPUs and NUMA nodes 26 | # (see linux/Documentation/cgroups/cpusets.txt for precise syntax) 27 | 28 | #box0.cpus = 4-7 29 | #box0.mems = 1 30 | -------------------------------------------------------------------------------- /debian/changelog: -------------------------------------------------------------------------------- 1 | isolate (2.2.1) stable; urgency=medium 2 | 3 | * New upstream release. 4 | 5 | -- Martin Mares Wed, 01 Oct 2025 17:48:31 +0200 6 | 7 | isolate (2.2) stable; urgency=medium 8 | 9 | * New upstream release. 10 | 11 | * Added a dependency on useradd. 12 | 13 | -- Martin Mares Mon, 01 Sep 2025 11:44:24 +0200 14 | 15 | isolate (2.1.2) stable; urgency=medium 16 | 17 | * New upstream release. 18 | 19 | -- Martin Mares Thu, 14 Aug 2025 15:47:24 +0200 20 | 21 | isolate (2.1) stable; urgency=medium 22 | 23 | * New upstream release. 24 | 25 | * Isolate is available only to users who are members of the 26 | "isolate" group. 27 | 28 | -- Martin Mares Sun, 08 Jun 2025 02:31:41 +0200 29 | 30 | isolate (2.0-1) stable; urgency=medium 31 | 32 | * isolate-check-environment does not spew error messages 33 | if $TERM is not defined. 34 | 35 | -- Martin Mares Fri, 21 Jun 2024 20:39:17 +0200 36 | 37 | isolate (2.0) stable; urgency=medium 38 | 39 | * Initial release. 
40 | 41 | -- Martin Mares Tue, 11 Jun 2024 16:44:34 +0200 42 | -------------------------------------------------------------------------------- /isolate-check-environment.8.txt: -------------------------------------------------------------------------------- 1 | ISOLATE-CHECK-ENVIRONMENT(8) 2 | ============================ 3 | 4 | NAME 5 | ---- 6 | isolate-check-environment - Check for common environment quirks 7 | 8 | SYNOPSIS 9 | -------- 10 | *isolate-check-environment* [*-q*|*--quiet*] [*-e*|*--execute*] 11 | 12 | DESCRIPTION 13 | ----------- 14 | This script can be used to identify sources of run-time variability and other issues on 15 | Linux machines which may affect *isolate*(1). 16 | 17 | If *--execute* is not specified, the recommended actions are written to stdout as an executable 18 | shell script. With *--execute*, the script will attempt to make changes to make the system 19 | behave more deterministically. 20 | 21 | The changes performed by *--execute* persist only 22 | until a reboot. To persist across reboots, the standard output from this script 23 | should be added to `/etc/rc.local` or some other script that is run on each boot. 24 | Alternately, you could execute *isolate-check-environment --quiet --execute* 25 | from `rc.local`, but use this with caution as not all issues can be resolved in this way. 26 | 27 | The exit status will be 0 if all checks pass, or 1 if some checks have failed. 28 | 29 | Note that there are more strategies to reduce run-time variability further. 30 | See *isolate*(1) for details under REPRODUCIBILITY. 
31 | 32 | SEE ALSO 33 | -------- 34 | *isolate*(1) 35 | -------------------------------------------------------------------------------- /debian/build/run: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | BUILD_DIR=build-tmp 5 | # Build packages for $SUITE/$ARCH in the build container, then test-install them in the clean test container (SUITE, ARCH and BDIR are set by try_dist) 6 | build () 7 | { 8 | local BUILD_IMAGE=isolate-build-$ARCH-$SUITE 9 | local TEST_IMAGE=isolate-test-$ARCH-$SUITE 10 | local PLATFORM="--platform linux/$ARCH" 11 | 12 | echo "### Building packages for $SUITE/$ARCH" 13 | 14 | echo "# Updating build container" 15 | podman build $PLATFORM --file container-build/$SUITE --tag $BUILD_IMAGE 16 | 17 | echo "# Updating test container" 18 | podman build $PLATFORM --file container-test/$SUITE --tag $TEST_IMAGE 19 | 20 | echo "# Creating build directory" 21 | rm -rf $BDIR 22 | mkdir -p $BDIR 23 | cp do-build do-test $BDIR/ 24 | ( cd ../.. && git archive --prefix=isolate/ HEAD ) | ( cd $BDIR && tar x ) 25 | 26 | echo "# Building" 27 | podman run $PLATFORM -it --rm --volume ./$BDIR:/build --workdir /build $BUILD_IMAGE ./do-build 28 | 29 | echo "# Testing" 30 | podman run $PLATFORM -it --rm --volume ./$BDIR:/build --workdir /build $TEST_IMAGE ./do-test 31 | } 32 | 33 | publish () 34 | { 35 | echo "### Publishing packages for $SUITE/$ARCH" 36 | rsync $BDIR/*.deb jw:/projects/isolate/www/debian/dists/$SUITE-isolate/main/binary-$ARCH/ 37 | } 38 | 39 | try_dist () 40 | { 41 | local SUITE=$1 42 | local ARCH=$2 43 | local BDIR=$BUILD_DIR/$ARCH-$SUITE 44 | $STEP 45 | } 46 | 47 | try_all_dists () 48 | { 49 | local STEP=$1 50 | 51 | try_dist bookworm amd64 52 | try_dist trixie amd64 53 | try_dist noble amd64 54 | try_dist jammy amd64 55 | 56 | # Cross-building for arm64 requires qemu-user-static installed 57 | try_dist bookworm arm64 58 | try_dist trixie arm64 59 | } 60 | 61 | try_all_dists build 62 | 63 | echo -n "Press Enter to publish packages or Ctrl-C to abort ... 
" 64 | read ENTER 65 | 66 | try_all_dists publish 67 | 68 | echo "### Updating package index" 69 | ssh -t jw 'cd /projects/isolate/debian && ./genpkg' 70 | 71 | echo "### Cleaning up" 72 | rm -rf $BUILD_DIR 73 | 74 | echo "### Done" 75 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | isolate 2 | ======= 3 | 4 | Isolate is a sandbox built to safely run untrusted executables, like 5 | programs submitted by competitors in a programming contest. Isolate 6 | gives them a limited-access environment, preventing them from affecting 7 | the host system. It takes advantage of features specific to the Linux 8 | kernel, like namespaces and control groups. 9 | 10 | Isolate was developed by Martin Mareš () and Bernard Blackham 11 | () and is still maintained by the former author. 12 | Several other people contributed patches for features and bug fixes 13 | (see Git history for a list). Thanks! 14 | 15 | Originally, Isolate was a part of the [Moe Contest Environment](http://www.ucw.cz/moe/), 16 | but it evolved to a separate project used by different 17 | contest systems, most prominently [CMS](https://github.com/cms-dev/cms). 18 | It now lives at [GitHub](https://github.com/ioi/isolate), 19 | where you can submit bug reports and feature requests. 20 | 21 | If you are interested in more details, please read Martin's and Bernard's 22 | papers on [Isolate's design](https://mj.ucw.cz/papers/isolate.pdf) and 23 | [grading system security](https://mj.ucw.cz/papers/secgrad.pdf) published 24 | in the Olympiads in Informatics journal. 25 | Also, Isolate's [manual page](http://www.ucw.cz/isolate/isolate.1.html) 26 | is available online. 
27 | 28 | ## Installing Isolate 29 | 30 | To compile Isolate, you need: 31 | 32 | - pkg-config 33 | 34 | - headers for the libcap library (usually available in a libcap-dev package) 35 | 36 | - headers for the libsystemd library (libsystemd-dev package) for compilation 37 | of isolate-cg-keeper 38 | 39 | You may need `a2x` (found in [AsciiDoc](https://asciidoc-py.github.io/a2x.1.html)) for building the manual. 40 | But if you only want the isolate binary, you can just run `make isolate`. 41 | 42 | Recommended system setup is described in sections INSTALLATION and REPRODUCIBILITY 43 | of the manual page. 44 | 45 | ## Debian packages 46 | 47 | Isolate is also available as packages for stable Debian Linux and the last two LTS 48 | releases of Ubuntu, all on the amd64 architecture. To use them, add the following 49 | to your `/etc/apt/sources.list`: 50 | 51 | deb [arch=amd64 signed-by=/etc/apt/keyrings/isolate.asc] http://www.ucw.cz/isolate/debian/ bookworm-isolate main 52 | 53 | You also need to install the repository's public key: 54 | 55 | curl https://www.ucw.cz/isolate/debian/signing-key.asc >/etc/apt/keyrings/isolate.asc 56 | 57 | Then invoke: 58 | 59 | apt update && apt install isolate 60 | 61 | There are experimental packages for the arm64 architecture, too. 62 | -------------------------------------------------------------------------------- /NEWS: -------------------------------------------------------------------------------- 1 | Version 2.2.1 [2025-09-29] 2 | 3 | * Fixed the check for asymmetric cores in isolate-check-environment. 4 | 5 | Version 2.2 [2025-09-01] 6 | 7 | * Switched to a new kernel API for setting filesystem quotas, 8 | which works with more filesystems (e.g., tmpfs). 9 | 10 | * Wall-clock time is reported correctly even if the system 11 | clock is re-set during program execution. 12 | 13 | Version 2.1.2 [2025-08-14] 14 | 15 | * Build date and commit are recorded in built binaries only 16 | when building from a Git checkout. 
17 | 18 | Version 2.1.1 [2025-08-14] 19 | 20 | * isolate-check-environment checks for asymmetric cores. 21 | 22 | * By mistake, isolate was compiled without optimization. 23 | Added -O2 to CFLAGS. 24 | 25 | * Added packaging for Debian Trixie on both amd64 and arm64. 26 | 27 | Version 2.1 [2025-06-08] 28 | 29 | • There are official packages for Debian Bookworm (amd64, arm64) 30 | and last two LTS releases of Ubuntu (amd64 only). Packaged Isolate 31 | takes care of starting systemd services properly. It is available 32 | only to users that are members of "isolate" group. 33 | 34 | • Unit files for systemd are installed by default. 35 | 36 | • Cgroup-based timing works when --run is used multiple times 37 | on the same sandbox. 38 | 39 | • Added manual pages for isolate-cg-keeper and isolate-check-environment. 40 | The man page for isolate explains more about cgroups and containers. 41 | 42 | • isolate-check-environment checks presence of simultaneous 43 | multi-threading. 44 | 45 | • All binaries are compiled with security hardening flags. 46 | 47 | • Minor bug fixes. 48 | 49 | Version 2.0 [2024-02-28] 50 | 51 | • This version runs only on systems supporting CGroup v2, which are 52 | basically all new Linux systems. If you need to stick with CGroup v1, 53 | please use Isolate 1.10.1. 54 | 55 | • If you are running systemd, Isolate now comes with isolate.service 56 | that delegates a subtree of the cgroup hierarchy to Isolate. 57 | On systems without systemd, you have to set up the delegation 58 | yourself and set its root in Isolate's configuration file. 59 | 60 | • The --cg-timing switch has been removed. In control group mode, 61 | this mode of timing is always used. 62 | 63 | • Added a simple protocol for locking sandboxes. When a sandbox is 64 | initialized using "isolate --init", it is reserved for the calling 65 | user until "isolate --cleanup" is used. It is also not allowed 66 | to call "isolate --run" multiple times in parallel on the same box. 
67 | 68 | • "isolate --init" resets the sandbox if it already existed. 69 | 70 | • Root can operate sandboxes on behalf of other users using 71 | --as-uid and --as-gid options. 72 | 73 | • Configuration can specify that only root is allowed to create new 74 | sandboxes. Together with the previous feature, it allows for creation 75 | of system-wide daemons allocating sandboxes to users. One such daemon 76 | will probably appear in a future release of Isolate. 77 | -------------------------------------------------------------------------------- /isolate.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Process Isolator 3 | * 4 | * (c) 2012-2024 Martin Mares 5 | * (c) 2012-2014 Bernard Blackham 6 | */ 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #define NONRET __attribute__((noreturn)) 15 | #define UNUSED __attribute__((unused)) 16 | #define ARRAY_SIZE(a) (int)(sizeof(a)/sizeof(a[0])) 17 | 18 | /* isolate.c */ 19 | 20 | void NONRET __attribute__((format(printf,1,2))) die(char *msg, ...); 21 | void NONRET __attribute__((format(printf,1,2))) err(char *msg, ...); 22 | void __attribute__((format(printf,1,2))) msg(char *msg, ...); 23 | 24 | extern int pass_environ; 25 | extern int verbose; 26 | extern int block_quota; 27 | extern int inode_quota; 28 | extern int cg_enable; 29 | extern int cg_memory_limit; 30 | 31 | extern int box_id; 32 | extern uid_t box_uid, orig_uid; 33 | extern gid_t box_gid, orig_gid; 34 | 35 | /* util.c */ 36 | 37 | void *xmalloc(size_t size); 38 | char *xstrdup(char *str); 39 | char * __attribute__((format(printf,1,2))) xsprintf(const char *fmt, ...); 40 | 41 | void timespec_sub(const struct timespec *a, const struct timespec *b, struct timespec *result); 42 | 43 | int dir_exists(char *path); 44 | void rmtree(char *path); 45 | void make_dir(char *path); 46 | void make_dir_for(char *path); 47 | void chowntree(char *path, uid_t uid, gid_t gid, bool keep_special_files); 
48 | void keep_fd(int fd); 49 | void close_all_fds(void); 50 | 51 | void meta_open(const char *name); 52 | void meta_close(void); 53 | void __attribute__((format(printf,1,2))) meta_printf(const char *fmt, ...); 54 | 55 | /* rules.c */ 56 | 57 | int set_env_action(char *a0); 58 | char **setup_environment(void); 59 | 60 | void init_dir_rules(void); 61 | int set_dir_action(char *arg); 62 | void apply_dir_rules(int with_defaults); 63 | 64 | void set_quota(void); 65 | 66 | /* cg.c (without cg_enable, these functions do nothing) */ 67 | 68 | // Initialize CG machinery 69 | void cg_init(void); 70 | 71 | // Create a new CG for the box (during isolate --init) 72 | void cg_create(void); 73 | 74 | // Destroy the box CG (during isolate --cleanup) 75 | void cg_remove(void); 76 | 77 | // Prepare the box CG for use (during isolate --run) 78 | void cg_setup(void); 79 | 80 | // Move the current process to the box CG 81 | void cg_enter(void); 82 | 83 | // Obtain statistics on the box CG 84 | int cg_get_run_time_ms(void); 85 | void cg_stats(void); 86 | 87 | /* config.c */ 88 | 89 | extern char *cf_box_root; 90 | extern char *cf_lock_root; 91 | extern char *cf_cg_root; 92 | extern int cf_first_uid; 93 | extern int cf_first_gid; 94 | extern int cf_num_boxes; 95 | extern int cf_restricted_init; 96 | /* Per-box settings from "boxN.cpus" / "boxN.mems" config items; entries are chained through next */ 97 | struct cf_per_box { 98 | struct cf_per_box *next; 99 | int box_id; 100 | char *cpus; 101 | char *mems; 102 | }; 103 | 104 | void cf_parse(void); 105 | struct cf_per_box *cf_per_box(int box_id); 106 | /* Per-box configuration of the box selected by the global box_id */ 107 | static inline struct cf_per_box * 108 | cf_current_box(void) 109 | { 110 | return cf_per_box(box_id); 111 | } 112 | -------------------------------------------------------------------------------- /isolate-cg-keeper.c: -------------------------------------------------------------------------------- 1 | /* 2 | * A Trivial Helper Daemon for Keeping Control Groups in SystemD 3 | * 4 | * (c) 2022--2024 Martin Mares 5 | */ 6 | 7 | #include "isolate.h" 8 | 9 | #include 10 | #include 11 | 
#include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #define CGROUP_FS "/sys/fs/cgroup" 18 | /* Print a printf-style message and a newline to stderr, then exit with status 1 */ 19 | void NONRET __attribute__((format(printf,1,2))) 20 | die(char *msg, ...) 21 | { 22 | va_list args; 23 | va_start(args, msg); 24 | vfprintf(stderr, msg, args); 25 | fputc('\n', stderr); 26 | exit(1); 27 | } 28 | /* Write a printf-formatted value to the attribute file cg_root/name; dies if the file cannot be opened or the write is incomplete */ 29 | static void __attribute__((format(printf,3,4))) 30 | write_cg_attr(const char *cg_root, const char *name, const char *fmt, ...) 31 | { 32 | va_list args; 33 | va_start(args, fmt); 34 | 35 | char namebuf[1024]; 36 | snprintf(namebuf, sizeof(namebuf), "%s/%s", cg_root, name); 37 | 38 | char valbuf[1024]; 39 | vsnprintf(valbuf, sizeof(valbuf), fmt, args); 40 | int len = strlen(valbuf); 41 | 42 | int fd = open(namebuf, O_WRONLY); 43 | if (fd < 0) 44 | die("Cannot open %s: %m", namebuf); 45 | 46 | if (write(fd, valbuf, len) != len) 47 | die("Cannot write to %s: %m", namebuf); 48 | 49 | close(fd); 50 | va_end(args); 51 | } 52 | /* Die unless CGROUP_FS exists, has no "unified" entry (combined v1+v2 mode) and has cgroup.subtree_control (cgroup v2) */ 53 | static void 54 | check_cgroup_fs(void) 55 | { 56 | struct stat st; 57 | 58 | if (stat(CGROUP_FS, &st) < 0) 59 | die("Cannot find %s: %m", CGROUP_FS); 60 | 61 | if (stat(CGROUP_FS "/unified", &st) >= 0) 62 | die("Combined cgroup v1+v2 mode is not supported"); 63 | 64 | if (stat(CGROUP_FS "/cgroup.subtree_control", &st) < 0) 65 | die("Cgroup v2 not found"); 66 | } 67 | /* Return CGROUP_FS followed by the path from the "0::" line of /proc/self/cgroup (string built by xsprintf); dies if no such line exists */ 68 | static char * 69 | get_my_cgroup(void) 70 | { 71 | FILE *f = fopen("/proc/self/cgroup", "r"); 72 | if (!f) 73 | die("Cannot open /proc/self/cgroup: %m"); 74 | 75 | char *line = NULL; 76 | size_t buflen = 0; 77 | ssize_t len; 78 | char *cg = NULL; 79 | 80 | while ((len = getline(&line, &buflen, f)) >= 0) 81 | { 82 | if (len > 0 && line[len-1] == '\n') 83 | line[--len] = 0; 84 | if (line[0] == '0' && line[1] == ':' && line[2] == ':') 85 | { 86 | cg = xsprintf(CGROUP_FS "%s", line + 3); 87 | break; 88 | } 89 | } 90 | 91 | if (!cg) 92 | die("Cannot find my own cgroup"); 93 | 94 | free(line); 95 | fclose(f); 96 | return cg; 97 | } 98 | /* Write the cgroup path cg followed by a newline to file; make_dir_for() presumably creates the parent directory first (defined in util.c, not shown) */ 99 | static 
void 100 | write_auto_cgroup(char *file, char *cg) 101 | { 102 | make_dir_for(file); 103 | 104 | FILE *f = fopen(file, "w"); 105 | if (!f) 106 | die("Cannot create %s: %m", file); 107 | fprintf(f, "%s\n", cg); 108 | fclose(f); 109 | } 110 | 111 | static void 112 | setup_cg(void) 113 | { 114 | char *cg = cf_cg_root; 115 | if (strlen(cf_cg_root) > 5 && !memcmp(cf_cg_root, "auto:", 5)) 116 | { 117 | check_cgroup_fs(); 118 | cg = get_my_cgroup(); 119 | write_auto_cgroup(cf_cg_root + 5, cg); 120 | } 121 | 122 | struct stat st; 123 | if (stat(cg, &st), 0) 124 | die("Control group root %s does not exist: %m", cg); 125 | 126 | char subgroup[1024]; 127 | snprintf(subgroup, sizeof(subgroup), "%s/daemon", cg); 128 | if (mkdir(subgroup, 0777) < 0) 129 | die("Cannot create subgroup %s: %m", subgroup); 130 | 131 | write_cg_attr(cg, "daemon/cgroup.procs", "%d\n", (int) getpid()); 132 | write_cg_attr(cg, "cgroup.subtree_control", "+cpuset +memory\n"); 133 | } 134 | 135 | int 136 | main(int argc UNUSED, char **argv UNUSED) 137 | { 138 | cf_parse(); 139 | setup_cg(); 140 | sd_notify(0, "READY=1"); 141 | for (;;) 142 | pause(); 143 | } 144 | -------------------------------------------------------------------------------- /config.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Process Isolator -- Configuration File 3 | * 4 | * (c) 2016--2023 Martin Mares 5 | */ 6 | 7 | #include "isolate.h" 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #define MAX_LINE_LEN 1024 15 | 16 | char *cf_box_root; 17 | char *cf_lock_root; 18 | char *cf_cg_root; 19 | int cf_first_uid; 20 | int cf_first_gid; 21 | int cf_num_boxes; 22 | int cf_restricted_init; 23 | 24 | static int line_number; 25 | static struct cf_per_box *per_box_configs; 26 | 27 | static void NONRET 28 | cf_err(char *msg) 29 | { 30 | die("Error in config file, line %d: %s", line_number, msg); 31 | } 32 | 33 | static char * 34 | cf_string(char *val) 35 | { 36 | return 
xstrdup(val); 37 | } 38 | 39 | static int 40 | cf_int(char *val) 41 | { 42 | char *end; 43 | errno = 0; 44 | long int x = strtol(val, &end, 10); 45 | if (errno || end == val || end && *end) 46 | cf_err("Invalid number"); 47 | if ((long int)(int) x != x) 48 | cf_err("Number out of range"); 49 | return x; 50 | } 51 | 52 | static void 53 | cf_entry_toplevel(char *key, char *val) 54 | { 55 | if (!strcmp(key, "box_root")) 56 | cf_box_root = cf_string(val); 57 | else if (!strcmp(key, "lock_root")) 58 | cf_lock_root = cf_string(val); 59 | else if (!strcmp(key, "cg_root")) 60 | cf_cg_root = cf_string(val); 61 | else if (!strcmp(key, "first_uid")) 62 | cf_first_uid = cf_int(val); 63 | else if (!strcmp(key, "first_gid")) 64 | cf_first_gid = cf_int(val); 65 | else if (!strcmp(key, "num_boxes")) 66 | cf_num_boxes = cf_int(val); 67 | else if (!strcmp(key, "restricted_init")) 68 | cf_restricted_init = cf_int(val); 69 | else 70 | cf_err("Unknown configuration item"); 71 | } 72 | 73 | static void 74 | cf_entry_compound(char *key, char *subkey, char *val) 75 | { 76 | if (strncmp(key, "box", 3)) 77 | cf_err("Unknown configuration section"); 78 | int box_id = cf_int(key + 3); 79 | struct cf_per_box *c = cf_per_box(box_id); 80 | 81 | if (!strcmp(subkey, "cpus")) 82 | c->cpus = cf_string(val); 83 | else if (!strcmp(subkey, "mems")) 84 | c->mems = cf_string(val); 85 | else 86 | cf_err("Unknown per-box configuration item"); 87 | } 88 | 89 | static void 90 | cf_entry(char *key, char *val) 91 | { 92 | char *dot = strchr(key, '.'); 93 | if (!dot) 94 | cf_entry_toplevel(key, val); 95 | else 96 | { 97 | *dot++ = 0; 98 | cf_entry_compound(key, dot, val); 99 | } 100 | } 101 | 102 | static void 103 | cf_check(void) 104 | { 105 | if (!cf_box_root || 106 | !cf_lock_root || 107 | !cf_cg_root || 108 | !cf_first_uid || 109 | !cf_first_gid || 110 | !cf_num_boxes) 111 | cf_err("Configuration is not complete"); 112 | } 113 | 114 | void 115 | cf_parse(void) 116 | { 117 | FILE *f = fopen(CONFIG_FILE, 
"r"); 118 | if (!f) 119 | die("Cannot open %s: %m", CONFIG_FILE); 120 | 121 | char line[MAX_LINE_LEN]; 122 | while (fgets(line, sizeof(line), f)) 123 | { 124 | line_number++; 125 | char *nl = strchr(line, '\n'); 126 | if (!nl) 127 | cf_err("Line not terminated or too long"); 128 | *nl = 0; 129 | 130 | if (!line[0] || line[0] == '#') 131 | continue; 132 | 133 | char *s = line; 134 | while (*s && *s != ' ' && *s != '\t' && *s != '=') 135 | s++; 136 | while (*s == ' ' || *s == '\t') 137 | *s++ = 0; 138 | if (*s != '=') 139 | cf_err("Syntax error, expecting key=value"); 140 | *s++ = 0; 141 | while (*s == ' ' || *s == '\t') 142 | *s++ = 0; 143 | 144 | cf_entry(line, s); 145 | } 146 | 147 | fclose(f); 148 | cf_check(); 149 | } 150 | 151 | struct cf_per_box * 152 | cf_per_box(int box_id) 153 | { 154 | struct cf_per_box *c; 155 | 156 | for (c = per_box_configs; c; c = c->next) 157 | if (c->box_id == box_id) 158 | return c; 159 | 160 | c = xmalloc(sizeof(*c)); 161 | memset(c, 0, sizeof(*c)); 162 | c->next = per_box_configs; 163 | per_box_configs = c; 164 | c->box_id = box_id; 165 | return c; 166 | } 167 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Isolate 2 | # (c) 2015--2025 Martin Mares 3 | # (c) 2017 Bernard Blackham 4 | 5 | VERSION=2.2.1 6 | YEAR=2025 7 | 8 | PROGRAMS=isolate isolate-check-environment isolate-cg-keeper 9 | MANPAGES=isolate.1 isolate-check-environment.8 isolate-cg-keeper.8 10 | CONFIGS=default.cf systemd/isolate.slice systemd/isolate.service 11 | 12 | all: $(PROGRAMS) $(MANPAGES) $(addsuffix .html, $(MANPAGES)) $(CONFIGS) 13 | 14 | CC=gcc 15 | CFLAGS=-std=gnu99 -O2 -Wall -Wextra -Wno-parentheses -Wno-unused-result -Wno-missing-field-initializers -Wstrict-prototypes -Wmissing-prototypes $(CFLAGS_HARDEN) -D_GNU_SOURCE $(CFLAGS_EXTRA) 16 | LDFLAGS=$(LDFLAGS_HARDEN) 17 | LIBS=-lcap 18 | 19 | # Inspiration: 
https://best.openssf.org/Compiler-Hardening-Guides/Compiler-Options-Hardening-Guide-for-C-and-C++.html 20 | CFLAGS_HARDEN=-D_FORTIFY_SOURCE=3 -fstack-protector-strong -fstack-clash-protection -fPIE -pie 21 | LDFLAGS_HARDEN=-Wl,-z,nodlopen -Wl,-z,noexecstack -Wl,-z,relro -Wl,-z,now 22 | 23 | CFLAGS_BUILD=-DISOLATE_VERSION='"$(VERSION)"' -DISOLATE_YEAR='"$(YEAR)"' 24 | 25 | # If we are building from a checked out repository, include build date and commit 26 | BUILD_FROM_GIT := $(shell if [ -d .git ] ; then echo yes ; fi) 27 | ifdef BUILD_FROM_GIT 28 | BUILD_DATE := $(shell date '+%Y-%m-%d') 29 | BUILD_COMMIT := $(shell if git rev-parse >/dev/null 2>/dev/null ; then git describe --always --tags ; else echo '' ; fi) 30 | CFLAGS_BUILD += -DBUILD_DATE='"$(BUILD_DATE)"' -DBUILD_COMMIT='"$(BUILD_COMMIT)"' 31 | endif 32 | 33 | PREFIX = /usr/local 34 | VARPREFIX = /var/local 35 | CONFIGDIR = $(PREFIX)/etc 36 | CONFIG = $(CONFIGDIR)/isolate 37 | BINDIR = $(PREFIX)/bin 38 | LIBDIR = $(PREFIX)/lib 39 | SBINDIR = $(PREFIX)/sbin 40 | DATADIR = $(PREFIX)/share 41 | MANDIR = $(DATADIR)/man 42 | MAN1DIR = $(MANDIR)/man1 43 | MAN8DIR = $(MANDIR)/man8 44 | BOXDIR = $(VARPREFIX)/lib/isolate 45 | UNITDIR = $(LIBDIR)/systemd/system 46 | 47 | SYSTEMD_CFLAGS := $(shell pkg-config libsystemd --cflags) 48 | SYSTEMD_LIBS := $(shell pkg-config libsystemd --libs) 49 | 50 | isolate: isolate.o util.o rules.o cg.o config.o 51 | $(CC) $(LDFLAGS) -o $@ $^ $(LIBS) 52 | 53 | isolate-cg-keeper: isolate-cg-keeper.o config.o util.o 54 | $(CC) $(LDFLAGS) -o $@ $^ $(SYSTEMD_LIBS) 55 | 56 | %.o: %.c isolate.h 57 | $(CC) $(CFLAGS) -c -o $@ $< 58 | 59 | isolate.o: CFLAGS += $(CFLAGS_BUILD) 60 | config.o: CFLAGS += -DCONFIG_FILE='"$(CONFIG)"' 61 | isolate-cg-keeper.o: CFLAGS += $(SYSTEMD_CFLAGS) 62 | 63 | %.1: %.1.txt 64 | a2x -f manpage $< 65 | 66 | %.8: %.8.txt 67 | a2x -f manpage $< 68 | 69 | # The dependency on %.1 is there to serialize both calls of asciidoc, 70 | # which does not name temporary files 
safely. 71 | %.1.html: %.1.txt %.1 72 | a2x -f xhtml -D . $< 73 | 74 | %.8.html: %.8.txt %.8 75 | a2x -f xhtml -D . $< 76 | 77 | %: %.in 78 | sed "s|@SBINDIR@|$(SBINDIR)|g; s|@BOXDIR@|$(BOXDIR)|g" <$< >$@ 79 | 80 | clean: 81 | rm -f *.o 82 | rm -f isolate isolate-cg-keeper 83 | rm -f $(MANPAGES) $(addsuffix .html, $(MANPAGES)) 84 | rm -f docbook-xsl.css 85 | rm -f default.cf 86 | rm -f systemd/isolate.service 87 | 88 | install: $(PROGRAMS) $(CONFIGS) 89 | install -d $(DESTDIR)$(BINDIR) $(DESTDIR)$(SBINDIR) $(DESTDIR)$(BOXDIR) $(DESTDIR)$(CONFIGDIR) $(DESTDIR)$(UNITDIR) 90 | install isolate-check-environment $(DESTDIR)$(BINDIR) 91 | install isolate-cg-keeper $(DESTDIR)$(SBINDIR) 92 | install -m 4755 isolate $(DESTDIR)$(BINDIR) 93 | install -m 644 default.cf $(DESTDIR)$(CONFIG) 94 | install -m 644 systemd/isolate.slice systemd/isolate.service $(DESTDIR)$(UNITDIR) 95 | 96 | install-doc: $(MANPAGES) 97 | install -d $(DESTDIR)$(MAN1DIR) $(DESTDIR)$(MAN8DIR) 98 | install -m 644 isolate.1 $(DESTDIR)$(MAN1DIR)/ 99 | install -m 644 isolate-check-environment.8 isolate-cg-keeper.8 $(DESTDIR)$(MAN8DIR)/ 100 | 101 | release: $(addsuffix .html,$(MANPAGES)) 102 | git tag v$(VERSION) 103 | git push --tags 104 | git archive --format=tar --prefix=isolate-$(VERSION)/ HEAD | gzip >isolate-$(VERSION).tar.gz 105 | rsync isolate-$(VERSION).tar.gz jw:/home/ftp/pub/mj/isolate/ 106 | rsync $(addsuffix .html,$(MANPAGES)) jw:/projects/isolate/www/ 107 | ssh jw 'cd web && bin/release-prog isolate $(VERSION)' 108 | 109 | .PHONY: all clean install install-doc release 110 | -------------------------------------------------------------------------------- /cg.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Process Isolator -- Control Groups 3 | * 4 | * (c) 2012-2024 Martin Mares 5 | * (c) 2012-2014 Bernard Blackham 6 | */ 7 | 8 | #include "isolate.h" 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 
17 | #include 18 | #include 19 | 20 | static char cg_name[256]; 21 | 22 | #define CG_BUFSIZE 1024 23 | 24 | static void 25 | cg_makepath(char *buf, size_t len, const char *attr) 26 | { 27 | int out; 28 | if (attr) 29 | out = snprintf(buf, len, "%s/%s/%s", cf_cg_root, cg_name, attr); 30 | else 31 | out = snprintf(buf, len, "%s/%s", cf_cg_root, cg_name); 32 | assert((size_t) out < len); 33 | } 34 | 35 | static int 36 | cg_read(const char *attr, char *buf) 37 | { 38 | int result = 0; 39 | int maybe = 0; 40 | if (attr[0] == '?') 41 | { 42 | attr++; 43 | maybe = 1; 44 | } 45 | 46 | char path[PATH_MAX]; 47 | cg_makepath(path, sizeof(path), attr); 48 | 49 | int fd = open(path, O_RDONLY); 50 | if (fd < 0) 51 | { 52 | if (maybe) 53 | goto fail; 54 | die("Cannot read %s: %m", path); 55 | } 56 | 57 | int n = read(fd, buf, CG_BUFSIZE); 58 | if (n < 0) 59 | { 60 | if (maybe) 61 | goto fail_close; 62 | die("Cannot read %s: %m", path); 63 | } 64 | if (n >= CG_BUFSIZE - 1) 65 | die("Attribute %s too long", path); 66 | if (n > 0 && buf[n-1] == '\n') 67 | n--; 68 | buf[n] = 0; 69 | 70 | if (verbose > 1) 71 | msg("CG: Read %s = <%s>\n", attr, buf); 72 | 73 | result = 1; 74 | fail_close: 75 | close(fd); 76 | fail: 77 | return result; 78 | } 79 | 80 | static void __attribute__((format(printf,2,3))) 81 | cg_write(const char *attr, const char *fmt, ...) 
82 | { 83 | int maybe = 0; 84 | if (attr[0] == '?') 85 | { 86 | attr++; 87 | maybe = 1; 88 | } 89 | 90 | va_list args; 91 | va_start(args, fmt); 92 | 93 | char buf[CG_BUFSIZE]; 94 | int n = vsnprintf(buf, sizeof(buf), fmt, args); 95 | if (n >= CG_BUFSIZE) 96 | die("cg_write: Value for attribute %s is too long", attr); 97 | 98 | if (verbose > 1) 99 | msg("CG: Write %s = %s", attr, buf); 100 | 101 | char path[PATH_MAX]; 102 | cg_makepath(path, sizeof(path), attr); 103 | 104 | int fd = open(path, O_WRONLY | O_TRUNC); 105 | if (fd < 0) 106 | { 107 | if (maybe) 108 | goto fail; 109 | else 110 | die("Cannot write %s: %m", path); 111 | } 112 | 113 | int written = write(fd, buf, n); 114 | if (written < 0) 115 | { 116 | if (maybe) 117 | goto fail_close; 118 | else 119 | die("Cannot set %s to %s: %m", path, buf); 120 | } 121 | if (written != n) 122 | die("Short write to %s (%d out of %d bytes)", path, written, n); 123 | 124 | fail_close: 125 | close(fd); 126 | fail: 127 | va_end(args); 128 | } 129 | 130 | static FILE *cg_fopen(const char *attr) 131 | { 132 | char path[PATH_MAX]; 133 | cg_makepath(path, sizeof(path), attr); 134 | 135 | FILE *f = fopen(path, "r"); 136 | if (!f) 137 | die("Cannot open %s: %m", path); 138 | 139 | return f; 140 | } 141 | 142 | static void cg_fclose(FILE *f) 143 | { 144 | if (ferror(f)) 145 | die("Read error on cgroup attributes: %m"); 146 | fclose(f); 147 | } 148 | 149 | static int cg_fread_kv(FILE *f, char *key, char *val) 150 | { 151 | char line[CG_BUFSIZE]; 152 | 153 | if (!fgets(line, sizeof(line), f)) 154 | return 0; 155 | 156 | char *eol = strchr(line, '\n'); 157 | if (!eol) 158 | die("Non-terminated or too long line in cgroup key-value file"); 159 | *eol = 0; 160 | 161 | char *space = strchr(line, ' '); 162 | if (!space) 163 | die("Missing space in cgroup key-value file"); 164 | *space = 0; 165 | 166 | strcpy(key, line); 167 | strcpy(val, space + 1); 168 | return 1; 169 | } 170 | 171 | void 172 | cg_init(void) 173 | { 174 | if (!cg_enable) 
175 | return; 176 | 177 | if (strlen(cf_cg_root) > 5 && !memcmp(cf_cg_root, "auto:", 5)) 178 | { 179 | char *filename = cf_cg_root + 5; 180 | FILE *f = fopen(filename, "r"); 181 | if (!f) 182 | die("Cannot open %s: %m", filename); 183 | 184 | char *line = NULL; 185 | size_t len; 186 | if (getline(&line, &len, f) < 0) 187 | die("Cannot read from %s: %m", filename); 188 | 189 | char *sep = strchr(line, '\n'); 190 | if (sep) 191 | *sep = 0; 192 | 193 | fclose(f); 194 | cf_cg_root = line; 195 | } 196 | 197 | if (!dir_exists(cf_cg_root)) 198 | die("Control group root %s does not exist", cf_cg_root); 199 | 200 | snprintf(cg_name, sizeof(cg_name), "box-%d", box_id); 201 | 202 | msg("Using control group %s under parent %s\n", cg_name, cf_cg_root); 203 | } 204 | 205 | void 206 | cg_create(void) 207 | { 208 | if (!cg_enable) 209 | return; 210 | 211 | struct stat st; 212 | char path[PATH_MAX]; 213 | 214 | cg_makepath(path, sizeof(path), NULL); 215 | if (stat(path, &st) >= 0 || errno != ENOENT) 216 | { 217 | msg("Control group %s already exists, trying to empty it.\n", path); 218 | if (rmdir(path) < 0) 219 | die("Failed to reset control group %s: %m", path); 220 | } 221 | 222 | if (mkdir(path, 0777)) 223 | die("Failed to create control group %s: %m", path); 224 | } 225 | 226 | void 227 | cg_enter(void) 228 | { 229 | if (!cg_enable) 230 | return; 231 | 232 | msg("Entering control group %s\n", cg_name); 233 | 234 | cg_write("cgroup.procs", "%d\n", (int) getpid()); 235 | 236 | if (cg_memory_limit) 237 | { 238 | cg_write("memory.max", "%lld\n", (long long) cg_memory_limit << 10); 239 | cg_write("?memory.swap.max", "0\n"); 240 | } 241 | 242 | struct cf_per_box *cf = cf_current_box(); 243 | if (cf->cpus) 244 | cg_write("cpuset.cpus", "%s", cf->cpus); 245 | if (cf->mems) 246 | cg_write("cpuset.mems", "%s", cf->mems); 247 | } 248 | 249 | static int 250 | raw_get_run_time_ms(void) 251 | { 252 | FILE *f = cg_fopen("cpu.stat"); 253 | unsigned long long usec = 0; 254 | bool found_usage = 
false; 255 | 256 | char key[CG_BUFSIZE], val[CG_BUFSIZE]; 257 | while (cg_fread_kv(f, key, val)) 258 | { 259 | if (!strcmp(key, "usage_usec")) 260 | { 261 | usec = atoll(val); 262 | found_usage = true; 263 | } 264 | } 265 | 266 | cg_fclose(f); 267 | if (!found_usage) 268 | die("Missing usage_usec in cpu.stat"); 269 | 270 | return usec / 1000; 271 | } 272 | 273 | static int cg_time_offset; 274 | 275 | int 276 | cg_get_run_time_ms(void) 277 | { 278 | if (!cg_enable) 279 | return 0; 280 | 281 | return raw_get_run_time_ms() - cg_time_offset; 282 | } 283 | 284 | void 285 | cg_setup(void) 286 | { 287 | if (!cg_enable) 288 | return; 289 | 290 | /* 291 | * The box CG can be used by multiple invocations of "isolate --run", 292 | * but cpu.stat is cummulative and cannot be reset. So we subtract 293 | * the initial value of cpu.stat. 294 | */ 295 | cg_time_offset = raw_get_run_time_ms(); 296 | if (verbose > 1) 297 | msg("CG: Time offset = %d", cg_time_offset); 298 | } 299 | 300 | void 301 | cg_stats(void) 302 | { 303 | if (!cg_enable) 304 | return; 305 | 306 | char key[CG_BUFSIZE], val[CG_BUFSIZE]; 307 | 308 | unsigned long long mem=0; 309 | if (cg_read("?memory.peak", val)) 310 | mem = atoll(val); 311 | if (mem) 312 | meta_printf("cg-mem:%lld\n", mem >> 10); 313 | 314 | // OOM kill detection 315 | FILE *f = cg_fopen("memory.events"); 316 | while (cg_fread_kv(f, key, val)) 317 | { 318 | if (!strcmp(key, "oom_kill") && atoll(val)) 319 | { 320 | meta_printf("cg-oom-killed:1\n"); 321 | break; 322 | } 323 | } 324 | cg_fclose(f); 325 | } 326 | 327 | void 328 | cg_remove(void) 329 | { 330 | if (!cg_enable) 331 | return; 332 | 333 | char path[PATH_MAX]; 334 | cg_makepath(path, sizeof(path), NULL); 335 | 336 | if (dir_exists(path)) 337 | { 338 | msg("Removing control group\n"); 339 | 340 | cg_write("?cgroup.kill", "1\n"); 341 | 342 | if (rmdir(path) < 0) 343 | die("Cannot remove control group %s: %m", path); 344 | } 345 | } 346 | 
/*
 * malloc() which dies instead of returning NULL.
 */
void *
xmalloc(size_t size)
{
  void *p = malloc(size);
  if (!p)
    die("Out of memory");
  return p;
}

/*
 * strdup() which dies instead of returning NULL.
 */
char *
xstrdup(char *str)
{
  char *p = strdup(str);
  if (!p)
    die("Out of memory");
  return p;
}

/*
 * Format a string like sprintf(), but to a newly allocated buffer
 * (obtained from vasprintf()). Dies if vasprintf() fails.
 */
char *xsprintf(const char *fmt, ...)
{
  va_list args;
  va_start(args, fmt);

  char *out;
  int res = vasprintf(&out, fmt, args);
  va_end(args);
  if (res < 0)
    die("Out of memory");

  return out;
}

/*
 * Compute RESULT = A - B, keeping tv_nsec of the result non-negative
 * by borrowing one second when needed.
 */
void
timespec_sub(const struct timespec *a, const struct timespec *b, struct timespec *result)
{
  result->tv_sec = a->tv_sec - b->tv_sec;
  result->tv_nsec = a->tv_nsec - b->tv_nsec;

  if (result->tv_nsec < 0)
    {
      result->tv_sec -= 1;
      result->tv_nsec += 1000000000L;
    }
}

/*
 * Return non-zero if PATH can be stat()ed and is a directory.
 */
int
dir_exists(char *path)
{
  struct stat st;
  return (stat(path, &st) >= 0 && S_ISDIR(st.st_mode));
}

/*
 * Create directory PATH including all missing parent directories.
 * PATH is modified temporarily while its prefixes are being created,
 * so it must be writable. Dies on failure.
 */
void
make_dir(char *path)
{
  // Skip the leading slash, so that we never try to create ""
  char *sep = (path[0] == '/' ? path+1 : path);

  for (;;)
    {
      sep = strchr(sep, '/');
      if (sep)
	*sep = 0;

      if (mkdir(path, 0777) < 0 && errno != EEXIST)
	die("Cannot create directory %s: %m", path);

      if (!sep)
	break;
      *sep++ = '/';
    }

  // mkdir() above may have returned EEXIST even if the path was not
  // a directory. Ensure that it is.
  struct stat st;
  if (stat(path, &st) < 0)
    die("Cannot stat %s: %m", path);
  if (!S_ISDIR(st.st_mode))
    die("Cannot create %s: already exists, but not a directory", path);
}

/*
 * Create all directories leading to the file PATH. PATH itself is
 * neither created nor modified.
 */
void make_dir_for(char *path)
{
  char *copy = xstrdup(path);
  char *last_slash = strrchr(copy, '/');

  // If the only slash is the leading one, the parent is the root
  // directory: there is nothing to create and make_dir("") would fail.
  if (last_slash && last_slash != copy)
    {
      *last_slash = 0;
      make_dir(copy);
    }
  free(copy);
}

/*
 * Once upon a time, we used nftw() for traversing directory trees.
 * It was simple, but unfortunately prone to symlink swapping attacks.
 * Using FTW_CHDIR would prevent the attacks, but it interacts badly with
 * FTW_DEPTH which we need when removing directory trees. See bug report at
 * https://sourceware.org/bugzilla/show_bug.cgi?id=28831.
 *
 * We therefore switched to our implementation based on using openat(),
 * fstatat() and similar functions.
 */

struct walk_context {
  // Current item
  int dir_fd;			// Directory the item lives in
  const char *name;		// Name of the item relative to dir_fd
  bool is_dir;
  struct stat st;

  // Common for the whole walk
  dev_t root_dev;		// Device of the root of the walk
  void (*callback)(struct walk_context *ctx);

  // Used by our callbacks
  uid_t chown_uid;
  gid_t chown_gid;
  bool keep_special_files;
};
subdir = *ctx; 167 | subdir.dir_fd = openat(ctx->dir_fd, ctx->name, O_RDONLY | O_DIRECTORY | O_NOFOLLOW); 168 | if (subdir.dir_fd < 0) 169 | die("Cannot open directory %s: %m", ctx->name); 170 | walktree_ctx(&subdir); 171 | ctx->is_dir = true; 172 | ctx->callback(ctx); 173 | } 174 | else 175 | { 176 | ctx->is_dir = false; 177 | ctx->callback(ctx); 178 | } 179 | } 180 | 181 | closedir(dir); 182 | } 183 | 184 | static void 185 | walktree(struct walk_context *ctx, const char *path, void (*callback)(struct walk_context *ctx)) 186 | { 187 | ctx->callback = callback; 188 | ctx->dir_fd = AT_FDCWD; 189 | ctx->name = path; 190 | 191 | struct walk_context top = *ctx; 192 | top.dir_fd = open(path, O_RDONLY | O_DIRECTORY); 193 | if (top.dir_fd < 0) 194 | die("Cannot open directory %s: %m", path); 195 | 196 | if (fstat(top.dir_fd, &ctx->st) < 0) 197 | die("Cannot stat %s: %m", path); 198 | assert(S_ISDIR(ctx->st.st_mode)); 199 | top.root_dev = ctx->st.st_dev; 200 | 201 | walktree_ctx(&top); 202 | 203 | ctx->is_dir = true; 204 | ctx->callback(ctx); 205 | } 206 | 207 | static void 208 | rmtree_helper(struct walk_context *ctx) 209 | { 210 | if (ctx->is_dir) 211 | { 212 | if (unlinkat(ctx->dir_fd, ctx->name, AT_REMOVEDIR) < 0) 213 | die("Cannot rmdir %s: %m", ctx->name); 214 | } 215 | else 216 | { 217 | if (unlinkat(ctx->dir_fd, ctx->name, 0) < 0) 218 | die("Cannot unlink %s: %m", ctx->name); 219 | } 220 | } 221 | 222 | void 223 | rmtree(char *path) 224 | { 225 | struct walk_context ctx = { }; 226 | walktree(&ctx, path, rmtree_helper); 227 | } 228 | 229 | static void 230 | chowntree_helper(struct walk_context *ctx) 231 | { 232 | if (S_ISREG(ctx->st.st_mode) || S_ISDIR(ctx->st.st_mode) || ctx->keep_special_files) 233 | { 234 | if (fchownat(ctx->dir_fd, ctx->name, ctx->chown_uid, ctx->chown_gid, AT_SYMLINK_NOFOLLOW) < 0) 235 | die("Cannot chown %s: %m", ctx->name); 236 | } 237 | else 238 | { 239 | if (unlinkat(ctx->dir_fd, ctx->name, 0) < 0) 240 | die("Cannot unlink special file %s: 
%m", ctx->name); 241 | } 242 | } 243 | 244 | void 245 | chowntree(char *path, uid_t uid, gid_t gid, bool keep_special_files) 246 | { 247 | struct walk_context ctx = { 248 | .chown_uid = uid, 249 | .chown_gid = gid, 250 | .keep_special_files = keep_special_files, 251 | }; 252 | walktree(&ctx, path, chowntree_helper); 253 | } 254 | 255 | static int fds_to_keep[4]; 256 | static int num_kept_fds; 257 | 258 | void 259 | keep_fd(int fd) 260 | { 261 | assert(num_kept_fds < ARRAY_SIZE(fds_to_keep)); 262 | fds_to_keep[num_kept_fds++] = fd; 263 | } 264 | 265 | static bool 266 | fd_is_kept(int fd) 267 | { 268 | for (int i=0; i < num_kept_fds; i++) 269 | if (fds_to_keep[i] == fd) 270 | return true; 271 | return false; 272 | } 273 | 274 | void 275 | close_all_fds(void) 276 | { 277 | /* Close all file descriptors except 0, 1, 2 */ 278 | 279 | DIR *dir = opendir("/proc/self/fd"); 280 | if (!dir) 281 | die("Cannot open /proc/self/fd: %m"); 282 | int dir_fd = dirfd(dir); 283 | 284 | struct dirent *e; 285 | while (e = readdir(dir)) 286 | { 287 | char *end; 288 | long int fd = strtol(e->d_name, &end, 10); 289 | if (*end) 290 | continue; 291 | if (fd >= 0 && fd <= 2 || fd == dir_fd || fd_is_kept(fd)) 292 | continue; 293 | close(fd); 294 | } 295 | 296 | closedir(dir); 297 | } 298 | 299 | /*** Meta-files ***/ 300 | 301 | static FILE *metafile; 302 | 303 | void 304 | meta_open(const char *name) 305 | { 306 | if (!strcmp(name, "-")) 307 | { 308 | metafile = stdout; 309 | return; 310 | } 311 | if (setfsuid(getuid()) < 0) 312 | die("Failed to switch FS UID: %m"); 313 | metafile = fopen(name, "w"); 314 | if (setfsuid(geteuid()) < 0) 315 | die("Failed to switch FS UID back: %m"); 316 | if (!metafile) 317 | die("Failed to open metafile '%s'",name); 318 | keep_fd(fileno(metafile)); 319 | } 320 | 321 | void 322 | meta_close(void) 323 | { 324 | if (metafile && metafile != stdout) 325 | fclose(metafile); 326 | } 327 | 328 | void 329 | meta_printf(const char *fmt, ...) 
# Print a short usage summary to stderr and exit with status 2.
usage() {
	cat <<EOT >&2
Usage: $0 [-q|--quiet] [-e|--execute]

Use this script to identify sources of run-time variability and other issues on
Linux machines which may affect isolate.

See manual page for details.
EOT
	exit 2
}
# Print first argument to stderr as warning, and second argument to stdout as
# the recommended remedial action, or execute if --execute is given.
action() {
	quiet || warn "$1"
	if [ -n "$execute" ] ; then
		quiet || echo "+ $2"
		sh -c "$2"
	else
		# Quoted, so that the command is shown verbatim (no globbing or word splitting)
		quiet || echo "$2"
	fi
}

# Announce a check on stderr. The status (PASS, FAIL, ...) is appended
# by the first print_* function called afterwards.
print_start_check() {
	quiet && return
	print_check_status=1
	echo -n "Checking for $* ... " >&2
}

# Report failure of the current check and make the script exit with status 1.
print_fail() {
	exit_status=1
	quiet && return
	[ -n "$print_check_status" ] && echo "${red}FAIL${normal}" >&2
	print_check_status=
}

# Report a dubious result of the current check and make the script exit with status 1.
print_dubious() {
	exit_status=1
	quiet && return
	[ -n "$print_check_status" ] && echo "${yellow}CAUTION${normal}" >&2
	print_check_status=
}

# Report that the current check does not apply to this machine.
print_skipped() {
	quiet && return
	[ -n "$print_check_status" ] && echo "SKIPPED (not detected)" >&2
	print_check_status=
}

# Report success of the current check, unless another status was already printed.
print_finish() {
	quiet && return
	[ -n "$print_check_status" ] && echo "${green}PASS${normal}" >&2
	print_check_status=
}

# Check that the given cgroup attribute file exists in $cg_root.
cgroup_check() {
	local cgroup=$1
	print_start_check "cgroup support for $cgroup"
	if ! test -f "$cg_root/$cgroup" ; then
		print_dubious
		warn "the $cgroup is not present. isolate --cg cannot be used."
	fi
	print_finish
}
We had better have the memory.swap support in the memory cgroup. 139 | if ! test -f "$cg_root/memory.swap.current" ; then 140 | print_fail 141 | action \ 142 | "swap is enabled, but swap accounting is not. isolate will not be able to enforce memory limits." \ 143 | "swapoff -a" 144 | else 145 | print_dubious 146 | warn "swap is enabled, and although accounted for, may still give run-time variability under memory pressure." 147 | fi 148 | fi 149 | print_finish 150 | } 151 | swap_check 152 | 153 | # Check that SMT is disabled. 154 | smt_check() { 155 | print_start_check "simultaneous multithreading" 156 | local val 157 | if val="$(cat /sys/devices/system/cpu/smt/active 2>/dev/null)" ; then 158 | if [ "$val" -ne 0 ] ; then 159 | print_fail 160 | 161 | val="$(cat /sys/devices/system/cpu/smt/control)" 162 | if [ "$val" != "notimplemented" ] ; then 163 | action \ 164 | "simultaneous multithreading is enabled." \ 165 | "echo off > /sys/devices/system/cpu/smt/control" 166 | else 167 | warn "SMT is enabled, but runtime SMT toggling is not supported. Add 'nosmt=1' to the kernel command line." 168 | fi 169 | fi 170 | else 171 | print_skipped 172 | fi 173 | print_finish 174 | } 175 | smt_check 176 | 177 | # Check that CPU frequency scaling is disabled. 
178 | cpufreq_check() { 179 | print_start_check "CPU frequency scaling" 180 | local anycpus policy 181 | anycpus= 182 | # Ensure cpufreq governor is set to performance on all CPUs 183 | for cpufreq_file in $(find /sys/devices/system/cpu/cpufreq/ -name scaling_governor) ; do 184 | if policy=$(cat $cpufreq_file 2>/dev/null) ; then 185 | if [ "$policy" != "performance" ] ; then 186 | print_fail 187 | action \ 188 | "cpufreq governor set to '$policy', but 'performance' would be better" \ 189 | "echo performance > $cpufreq_file" 190 | fi 191 | fi 192 | anycpus=1 193 | done 194 | [ -z "$anycpus" ] && print_skipped 195 | print_finish 196 | } 197 | cpufreq_check 198 | 199 | # Check that Intel frequency boost is disabled 200 | intel_boost_check() { 201 | print_start_check "Intel frequency boost" 202 | local val 203 | if val=$(cat /sys/devices/system/cpu/intel_pstate/no_turbo 2>/dev/null) ; then 204 | if [ "$val" -ne 1 ] ; then 205 | print_fail 206 | action \ 207 | "frequency boosting is enabled." \ 208 | "echo 1 > /sys/devices/system/cpu/intel_pstate/no_turbo" 209 | fi 210 | else 211 | print_skipped 212 | fi 213 | print_finish 214 | } 215 | intel_boost_check 216 | 217 | # Check that general frequency boost is disabled 218 | general_boost_check() { 219 | print_start_check "general frequency boost" 220 | local val 221 | if val=$(cat /sys/devices/system/cpu/cpufreq/boost 2>/dev/null) ; then 222 | if [ "$val" -ne 0 ] ; then 223 | print_fail 224 | action \ 225 | "frequency boosting is enabled." \ 226 | "echo 0 > /sys/devices/system/cpu/cpufreq/boost" 227 | fi 228 | else 229 | print_skipped 230 | fi 231 | print_finish 232 | } 233 | general_boost_check 234 | 235 | # Check that address space layout randomisation is disabled. 
aslr_check() {
    local setting
    print_start_check "kernel address space randomisation"
    # Unreadable sysctl: nothing to check. Otherwise any non-zero value
    # means that randomisation is on.
    if ! setting=$(cat /proc/sys/kernel/randomize_va_space 2>/dev/null) ; then
        print_skipped
    elif [ "$setting" -ne 0 ] ; then
        print_fail
        action \
            "address space randomisation is enabled." \
            "echo 0 > /proc/sys/kernel/randomize_va_space"
    fi
    print_finish
}
aslr_check

# Check that transparent huge-pages are disabled, as this leads to
# non-determinism depending on whether the kernel can allocate 2 MiB pages or
# not.
thp_check() {
    local thp_dir=/sys/kernel/mm/transparent_hugepage
    local knob subject val
    print_start_check "transparent hugepage support"

    # Both "enabled" and "defrag" list their possible settings and mark the
    # active one with brackets; anything other than "[never]" is a failure.
    # Knobs that cannot be read are silently ignored.
    for knob in enabled defrag ; do
        val=$(cat "$thp_dir/$knob" 2>/dev/null) || continue
        case $val in
            *'[never]'*) continue ;;
        esac
        if [ "$knob" = enabled ] ; then
            subject="transparent hugepages are"
        else
            subject="transparent hugepage defrag is"
        fi
        print_fail
        action \
            "$subject enabled." \
            "echo never > $thp_dir/$knob"
    done

    # The khugepaged knob is a plain number, 0 meaning disabled.
    if val=$(cat "$thp_dir/khugepaged/defrag" 2>/dev/null) && [ "$val" -ne 0 ] ; then
        print_fail
        action \
            "khugepaged defrag is enabled." \
            "echo 0 > $thp_dir/khugepaged/defrag"
    fi
    print_finish
}
thp_check

# Piping of core dumps to programs can make program crashes significantly
# slower. Unfortunately, dumps to pipes are not affected by RLIMIT_CORE,
# so we cannot easily disable them inside the sandbox.
292 | core_check() { 293 | print_start_check "core file pattern" 294 | local val 295 | if val="$(cat /proc/sys/kernel/core_pattern)" ; then 296 | if [ "${val:0:1}" = '|' ] ; then 297 | print_fail 298 | action \ 299 | "core files are piped to a program." \ 300 | "echo core >/proc/sys/kernel/core_pattern" 301 | fi 302 | else 303 | print_skipped 304 | fi 305 | print_finish 306 | } 307 | core_check 308 | 309 | # Without protected_hardlinks, the user running Isolate could trick it into 310 | # changing ownership of unrelated files. 311 | hardlink_check() { 312 | print_start_check "hard link protection" 313 | local val 314 | if val="$(cat /proc/sys/fs/protected_hardlinks)" ; then 315 | if [ $val = 0 ] ; then 316 | print_fail 317 | action \ 318 | "hardlink protection is disabled." \ 319 | "echo 1 >/proc/sys/fs/protected_hardlinks" 320 | fi 321 | else 322 | print_skipped 323 | fi 324 | print_finish 325 | } 326 | hardlink_check 327 | 328 | # Check for an Intel CPU with both P-cores and E-cores. 329 | # At the moment, we have no automatic remedy. 330 | asymmetric_core_check() { 331 | print_start_check "asymmetric cores" 332 | if [ -d /sys/devices/cpu_atom -a -d /sys/devices/cpu_core ] ; then 333 | print_dubious 334 | quiet || warn "the CPU has a combination of P-cores and E-cores, core pinning should be used." 
335 | fi 336 | print_finish 337 | } 338 | asymmetric_core_check 339 | 340 | 341 | exit $exit_status 342 | -------------------------------------------------------------------------------- /rules.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Process Isolator -- Rules 3 | * 4 | * (c) 2012-2025 Martin Mares 5 | * (c) 2012-2014 Bernard Blackham 6 | */ 7 | 8 | #include "isolate.h" 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | /*** Environment rules ***/ 26 | 27 | struct env_rule { 28 | char *var; // Variable to match 29 | char *val; // ""=clear, NULL=inherit 30 | int var_len; 31 | struct env_rule *next; 32 | }; 33 | 34 | static struct env_rule *first_env_rule; 35 | static struct env_rule **last_env_rule = &first_env_rule; 36 | 37 | static struct env_rule default_env_rules[] = { 38 | { .var = "LIBC_FATAL_STDERR_", .val = "1", .var_len = 18 }, 39 | }; 40 | 41 | int 42 | set_env_action(char *a0) 43 | { 44 | struct env_rule *r = xmalloc(sizeof(*r) + strlen(a0) + 1); 45 | char *a = (char *)(r+1); 46 | strcpy(a, a0); 47 | 48 | char *sep = strchr(a, '='); 49 | if (sep == a) 50 | return 0; 51 | r->var = a; 52 | if (sep) 53 | { 54 | *sep++ = 0; 55 | r->val = sep; 56 | } 57 | else 58 | r->val = NULL; 59 | *last_env_rule = r; 60 | last_env_rule = &r->next; 61 | r->next = NULL; 62 | return 1; 63 | } 64 | 65 | static int 66 | match_env_var(char *env_entry, struct env_rule *r) 67 | { 68 | if (strncmp(env_entry, r->var, r->var_len)) 69 | return 0; 70 | return (env_entry[r->var_len] == '='); 71 | } 72 | 73 | static void 74 | apply_env_rule(char **env, int *env_sizep, struct env_rule *r) 75 | { 76 | // First remove the variable if already set 77 | int pos = 0; 78 | while (pos < *env_sizep && !match_env_var(env[pos], r)) 79 | pos++; 80 | if (pos < *env_sizep) 81 | { 82 | 
(*env_sizep)--; 83 | env[pos] = env[*env_sizep]; 84 | env[*env_sizep] = NULL; 85 | } 86 | 87 | // What is the new value? 88 | char *new; 89 | if (r->val) 90 | { 91 | if (!r->val[0]) 92 | return; 93 | new = xmalloc(r->var_len + 1 + strlen(r->val) + 1); 94 | sprintf(new, "%s=%s", r->var, r->val); 95 | } 96 | else 97 | { 98 | pos = 0; 99 | while (environ[pos] && !match_env_var(environ[pos], r)) 100 | pos++; 101 | if (!(new = environ[pos])) 102 | return; 103 | } 104 | 105 | // Add it at the end of the array 106 | env[(*env_sizep)++] = new; 107 | env[*env_sizep] = NULL; 108 | } 109 | 110 | char ** 111 | setup_environment(void) 112 | { 113 | // Link built-in rules with user rules 114 | for (int i=ARRAY_SIZE(default_env_rules)-1; i >= 0; i--) 115 | { 116 | default_env_rules[i].next = first_env_rule; 117 | first_env_rule = &default_env_rules[i]; 118 | } 119 | 120 | // Scan the original environment 121 | char **orig_env = environ; 122 | int orig_size = 0; 123 | while (orig_env[orig_size]) 124 | orig_size++; 125 | 126 | // For each rule, reserve one more slot and calculate length 127 | int num_rules = 0; 128 | for (struct env_rule *r = first_env_rule; r; r=r->next) 129 | { 130 | num_rules++; 131 | r->var_len = strlen(r->var); 132 | } 133 | 134 | // Create a new environment 135 | char **env = xmalloc((orig_size + num_rules + 1) * sizeof(char *)); 136 | int size; 137 | if (pass_environ) 138 | { 139 | memcpy(env, environ, orig_size * sizeof(char *)); 140 | size = orig_size; 141 | } 142 | else 143 | size = 0; 144 | env[size] = NULL; 145 | 146 | // Apply the rules one by one 147 | for (struct env_rule *r = first_env_rule; r; r=r->next) 148 | apply_env_rule(env, &size, r); 149 | 150 | // Return the new env and pass some gossip 151 | if (verbose > 1) 152 | { 153 | fprintf(stderr, "Passing environment:\n"); 154 | for (int i=0; env[i]; i++) 155 | fprintf(stderr, "\t%s\n", env[i]); 156 | } 157 | return env; 158 | } 159 | 160 | /*** Directory rules ***/ 161 | 162 | struct dir_rule { 
163 | char *inside; // A relative path 164 | char *outside; // This can be an absolute path or a relative path starting with "./" 165 | unsigned int flags; // DIR_FLAG_xxx 166 | struct dir_rule *next; 167 | }; 168 | 169 | enum dir_rule_flags { 170 | DIR_FLAG_RW = 1, 171 | DIR_FLAG_NOEXEC = 2, 172 | DIR_FLAG_FS = 4, 173 | DIR_FLAG_MAYBE = 8, 174 | DIR_FLAG_DEV = 16, 175 | DIR_FLAG_TMP = 32, 176 | DIR_FLAG_NOREC = 64, 177 | DIR_FLAG_DEFAULT = 1U << 15, // Used internally 178 | DIR_FLAG_DISABLED = 1U << 16, // Used internally 179 | }; 180 | 181 | static const char * const dir_flag_names[] = { "rw", "noexec", "fs", "maybe", "dev", "tmp", "norec" }; 182 | 183 | static struct dir_rule *first_dir_rule; 184 | static struct dir_rule **last_dir_rule = &first_dir_rule; 185 | 186 | static char * 187 | sanitize_dir_path(char *path) 188 | { 189 | // Strip leading slashes 190 | while (*path == '/') 191 | path++; 192 | if (!*path) 193 | return NULL; 194 | 195 | // Check for ".." components 196 | char *p = path; 197 | while (*p) 198 | { 199 | char *next = strchr(p, '/'); 200 | if (!next) 201 | next = p + strlen(p); 202 | 203 | int len = next - p; 204 | if (len == 2 && !memcmp(p, "..", 2)) 205 | return NULL; 206 | 207 | p = *next ? 
next+1 : next; 208 | } 209 | 210 | return path; 211 | } 212 | 213 | static int 214 | add_dir_rule(char *in, char *out, unsigned int flags) 215 | { 216 | // Make sure that "in" does not try to escape the box 217 | in = sanitize_dir_path(in); 218 | if (!in) 219 | return 0; 220 | 221 | // Override an existing rule 222 | struct dir_rule *r; 223 | for (r = first_dir_rule; r; r = r->next) 224 | if (!strcmp(r->inside, in)) 225 | break; 226 | 227 | // Add a new rule 228 | if (!r) 229 | { 230 | r = xmalloc(sizeof(*r)); 231 | r->inside = in; 232 | *last_dir_rule = r; 233 | last_dir_rule = &r->next; 234 | r->next = NULL; 235 | } 236 | r->outside = out; 237 | r->flags = flags; 238 | return 1; 239 | } 240 | 241 | static unsigned int 242 | parse_dir_option(char *opt) 243 | { 244 | for (unsigned int i = 0; i < ARRAY_SIZE(dir_flag_names); i++) 245 | if (!strcmp(opt, dir_flag_names[i])) 246 | return 1U << i; 247 | die("Unknown directory option %s", opt); 248 | } 249 | 250 | static int 251 | set_dir_action_ext(char *arg, unsigned int ext_flags) 252 | { 253 | arg = xstrdup(arg); 254 | 255 | char *colon = strchr(arg, ':'); 256 | unsigned int flags = ext_flags; 257 | while (colon) 258 | { 259 | *colon++ = 0; 260 | char *next = strchr(colon, ':'); 261 | if (next) 262 | *next = 0; 263 | flags |= parse_dir_option(colon); 264 | colon = next; 265 | } 266 | 267 | char *eq = strchr(arg, '='); 268 | if (eq) 269 | *eq++ = 0; 270 | 271 | if ((flags & DIR_FLAG_FS) && (flags & DIR_FLAG_TMP)) 272 | return 0; 273 | 274 | if (flags & DIR_FLAG_FS) 275 | { 276 | if (!eq || strchr(eq, '/')) 277 | return 0; 278 | return add_dir_rule(arg, eq, flags); 279 | } 280 | else if (flags & DIR_FLAG_TMP) 281 | { 282 | if (eq) 283 | return 0; 284 | /* 285 | * Construct an outside temporary directory, which will be later 286 | * chowned to box_uid. 
The hierarchy of these directories is intentionally 287 | * flat, so that we avoid writing to a directory which might have already 288 | * tampered with in a previous run of the sandbox. 289 | */ 290 | char out[1024]; 291 | snprintf(out, sizeof(out), "./tmp/%s", arg); 292 | for (char *p = out + strlen("./tmp/"); *p; p++) 293 | if (*p == '/') 294 | *p = ':'; // This is safe, there were no colons in "out" 295 | return add_dir_rule(arg, xstrdup(out), flags | DIR_FLAG_RW); 296 | } 297 | else if (eq) 298 | { 299 | if (!eq[0]) 300 | return add_dir_rule(arg, NULL, flags); 301 | if (eq[0] != '/' && strncmp(eq, "./", 2)) 302 | return 0; 303 | return add_dir_rule(arg, eq, flags); 304 | } 305 | else 306 | { 307 | char *out = xmalloc(1 + strlen(arg) + 1); 308 | sprintf(out, "/%s", arg); 309 | return add_dir_rule(arg, out, flags); 310 | } 311 | } 312 | 313 | int 314 | set_dir_action(char *arg) 315 | { 316 | return set_dir_action_ext(arg, 0); 317 | } 318 | 319 | static int 320 | set_dir_action_default(char *arg) 321 | { 322 | return set_dir_action_ext(arg, DIR_FLAG_DEFAULT); 323 | } 324 | 325 | void 326 | init_dir_rules(void) 327 | { 328 | set_dir_action_default("box=./box:rw"); 329 | set_dir_action_default("bin"); 330 | set_dir_action_default("dev:dev"); 331 | set_dir_action_default("lib"); 332 | set_dir_action_default("lib64:maybe"); 333 | set_dir_action_default("proc=proc:fs"); 334 | set_dir_action_default("tmp:tmp"); 335 | set_dir_action_default("usr"); 336 | } 337 | 338 | static void 339 | set_cap_sys_admin(void) 340 | { 341 | cap_t caps; 342 | if (!(caps = cap_get_proc())) 343 | die("Cannot get capabilities: %m"); 344 | 345 | cap_value_t cap_list[] = { CAP_SYS_ADMIN }; 346 | if (cap_set_flag(caps, CAP_EFFECTIVE, 1, cap_list, CAP_SET) < 0) 347 | die("Cannot modify capabilities"); 348 | 349 | if (cap_set_proc(caps) < 0) 350 | die("Cannot set capabilities: %m"); 351 | 352 | cap_free(caps); 353 | } 354 | 355 | void 356 | apply_dir_rules(int with_defaults) 357 | { 358 | /* 359 
| * Before mounting anything, we create all mount points inside the box. 360 | * This is necessary to avoid bypassing directory permissions. If you 361 | * want nested binds, you have to create the mount points explicitly. 362 | */ 363 | for (struct dir_rule *r = first_dir_rule; r; r=r->next) 364 | { 365 | if (!with_defaults && (r->flags & DIR_FLAG_DEFAULT)) 366 | continue; 367 | 368 | char *in = r->inside; 369 | char *out = r->outside; 370 | 371 | if (!out) 372 | { 373 | msg("Not binding anything on %s\n", in); 374 | r->flags |= DIR_FLAG_DISABLED; 375 | continue; 376 | } 377 | 378 | if ((r->flags & DIR_FLAG_MAYBE) && !dir_exists(out)) 379 | { 380 | msg("Not binding %s on %s (does not exist)\n", out, r->inside); 381 | r->flags |= DIR_FLAG_DISABLED; 382 | continue; 383 | } 384 | 385 | char root_in[1024]; 386 | snprintf(root_in, sizeof(root_in), "root/%s", in); 387 | make_dir(root_in); 388 | } 389 | 390 | for (struct dir_rule *r = first_dir_rule; r; r=r->next) 391 | { 392 | if (r->flags & DIR_FLAG_DISABLED) 393 | continue; 394 | if (!with_defaults && (r->flags & DIR_FLAG_DEFAULT)) 395 | continue; 396 | 397 | char *in = r->inside; 398 | char *out = r->outside; 399 | char root_in[1024]; 400 | snprintf(root_in, sizeof(root_in), "root/%s", in); 401 | 402 | if (r->flags & DIR_FLAG_TMP) 403 | { 404 | make_dir(out); 405 | if (chown(out, box_uid, box_gid) < 0) 406 | die("Cannot chown %s: %m", out); 407 | if (chmod(out, 0700) < 0) 408 | die("Cannot chmod %s: %m", out); 409 | } 410 | 411 | unsigned long mount_flags = 0; 412 | if (!(r->flags & DIR_FLAG_RW)) 413 | mount_flags |= MS_RDONLY; 414 | if (r->flags & DIR_FLAG_NOEXEC) 415 | mount_flags |= MS_NOEXEC; 416 | if (!(r->flags & DIR_FLAG_DEV)) 417 | mount_flags |= MS_NODEV; 418 | 419 | if (r->flags & DIR_FLAG_FS) 420 | { 421 | msg("Mounting %s on %s (flags %lx)\n", out, in, mount_flags); 422 | if (mount("none", root_in, out, mount_flags, "") < 0) 423 | die("Cannot mount %s on %s: %m", out, in); 424 | if (!strcmp(in, "proc")) 
425 | { 426 | // If we are mounting procfs, add hidepid=2, so that only the processes 427 | // of the same user are visible. This has to be done as a remount. 428 | if (mount("none", root_in, out, MS_REMOUNT | mount_flags, "hidepid=2") < 0) 429 | die("Cannot re-mount proc with hidepid option: %m"); 430 | } 431 | } 432 | else 433 | { 434 | mount_flags |= MS_BIND | MS_NOSUID; 435 | if (!(r->flags & DIR_FLAG_NOREC)) 436 | mount_flags |= MS_REC; 437 | msg("Binding %s on %s (flags %lx)\n", out, in, mount_flags); 438 | 439 | /* 440 | * This is tricky. We cannot run mount() with root privileges, since 441 | * it could be used to bypass access control if the mounted path 442 | * contains elements inaccessible to the user running isolate. 443 | * 444 | * We switch effective UID and GID back to the calling user (which clears 445 | * all capabilities, but keeps them in the permitted set) and then 446 | * enable CAP_SYS_ADMIN. So we have CAP_SYS_ADMIN (needed for mount), 447 | * but not CAP_DAC_OVERRIDE (which allows to bypass permission checks). 448 | */ 449 | 450 | if (setresuid(orig_uid, orig_uid, 0) < 0 || 451 | setresgid(orig_gid, orig_gid, 0) < 0) 452 | die("Cannot switch UID and GID: %m"); 453 | 454 | set_cap_sys_admin(); 455 | 456 | // Most mount flags need remount to work 457 | if (mount(out, root_in, "none", mount_flags, "") < 0 || 458 | mount(out, root_in, "none", MS_REMOUNT | mount_flags, "") < 0) 459 | die("Cannot mount %s on %s: %m", out, in); 460 | 461 | if (setresuid(orig_uid, 0, orig_uid) < 0 || 462 | setresgid(orig_gid, 0, orig_gid) < 0) 463 | die("Cannot switch UID and GID: %m"); 464 | } 465 | } 466 | } 467 | 468 | /*** Disk quotas ***/ 469 | 470 | static void 471 | quotactl_error(void) 472 | { 473 | // This errno has an outstandingly unhelpful message of "no such process". 
474 | if (errno == ESRCH) 475 | die("Cannot set disk quota: quotas have not been enabled for this filesystem"); 476 | die("Cannot set disk quota: %m"); 477 | } 478 | 479 | void 480 | set_quota(void) 481 | { 482 | if (!block_quota) 483 | return; 484 | 485 | struct dqblk dq = { 486 | .dqb_bhardlimit = block_quota, 487 | .dqb_bsoftlimit = block_quota, 488 | .dqb_ihardlimit = inode_quota, 489 | .dqb_isoftlimit = inode_quota, 490 | .dqb_valid = QIF_LIMITS, 491 | }; 492 | void *dq_ptr = (void*)&dq; 493 | int quota_op = QCMD(Q_SETQUOTA, USRQUOTA); 494 | 495 | int cwd_fd = open(".", O_DIRECTORY | O_PATH); 496 | if (cwd_fd < 0) 497 | die("open: %m"); 498 | 499 | if (syscall(SYS_quotactl_fd, cwd_fd, quota_op, box_uid, dq_ptr) < 0) 500 | quotactl_error(); 501 | 502 | close(cwd_fd); 503 | 504 | msg("Quota: Set block quota %d and inode quota %d\n", block_quota, inode_quota); 505 | } 506 | -------------------------------------------------------------------------------- /isolate.1.txt: -------------------------------------------------------------------------------- 1 | ISOLATE(1) 2 | ========== 3 | 4 | NAME 5 | ---- 6 | isolate - Isolate a process using Linux Containers 7 | 8 | SYNOPSIS 9 | -------- 10 | *isolate* 'options' *--init* 11 | 12 | *isolate* 'options' *--run* +--+ 'program' 'arguments' 13 | 14 | *isolate* 'options' *--cleanup* 15 | 16 | DESCRIPTION 17 | ----------- 18 | Run 'program' within a sandbox, so that it cannot communicate with the 19 | outside world and its resource consumption is limited. This can be used 20 | for example in a programming contest to run untrusted programs submitted 21 | by contestants in a controlled environment. 22 | 23 | The sandbox is used in the following way: 24 | 25 | * Run *isolate --init*, which initializes the sandbox, creates its working directory and 26 | prints its name to the standard output. If the sandbox already existed, it 27 | is reset. 
28 | 29 | * Populate the directory with the executable file of the program and its 30 | input files. 31 | 32 | * Call *isolate --run* to run the program. A single line describing the 33 | status of the program is written to the standard error stream. 34 | 35 | * Fetch the output of the program from the directory. 36 | 37 | * Run *isolate --cleanup* to remove temporary files. Does nothing if the sandbox 38 | was already cleaned up. 39 | 40 | Please note that by default, the program is not allowed to start multiple 41 | processes or threads. If you need that, turn on the control group mode 42 | (see below). 43 | 44 | BASIC OPTIONS 45 | ------------- 46 | *-b, --box-id=*'id':: 47 | When you run multiple sandboxes in parallel, you have to assign unique 48 | IDs to them by this option. See the discussion on UIDs in the INSTALLATION 49 | section. The ID defaults to 0. 50 | 51 | *-M, --meta=*'file':: 52 | Output meta-data on the execution of the program to a given file. 53 | See below for syntax of the meta-files. 54 | 55 | *-i, --stdin=*'file':: 56 | Redirect standard input from 'file'. The 'file' has to be accessible 57 | inside the sandbox (which means that the sandboxed program can manipulate 58 | it arbitrarily). If not specified, standard input is inherited from the 59 | parent process. 60 | 61 | *-o, --stdout=*'file':: 62 | Redirect standard output to 'file'. The 'file' has to be accessible 63 | inside the sandbox (which means that the sandboxed program can manipulate 64 | it arbitrarily). If not specified, standard output is inherited from the 65 | parent process and the sandbox manager does not write anything to it. 66 | 67 | *-r, --stderr=*'file':: 68 | Redirect standard error output to 'file'. The 'file' has to be accessible 69 | inside the sandbox (which means that the sandboxed program can manipulate 70 | it arbitrarily). If not specified, standard error output is inherited from the 71 | parent process. See also *--stderr-to-stdout*.
72 | 73 | *--stderr-to-stdout*:: 74 | Redirect standard error output to standard output. This is performed after 75 | the standard output is redirected by *--stdout*. Mutually exclusive with *--stderr*. 76 | 77 | *-c, --chdir=*'dir':: 78 | Change directory to 'dir' before executing the program. This path must be 79 | relative to the root of the sandbox. 80 | 81 | *-v, --verbose*:: 82 | Tell the sandbox manager to be verbose and report on what is going on. 83 | Using *-v* multiple times produces even more jabber. 84 | 85 | *-s, --silent*:: 86 | Tell the sandbox manager to keep silence. No status messages are printed 87 | to stderr except for fatal errors of the sandbox itself. The combination of 88 | *--verbose* and *--silent* has an undefined effect. 89 | 90 | *--wait*:: 91 | Multiple instances of Isolate cannot manage the same sandbox simultaneously. 92 | If you attempt to do that, the new instance refuses to run. With this option, 93 | the new instance waits for the other instance to finish. 94 | 95 | LIMITS 96 | ------ 97 | The following options can limit system resources consumed by the program. 98 | 99 | *-m, --mem=*'size':: 100 | Limit address space of the program to 'size' kilobytes. If more processes 101 | are allowed, this applies to each of them separately. If this limit is reached, 102 | further memory allocations fail (e.g., malloc returns NULL). 103 | 104 | *-t, --time=*'time':: 105 | Limit run time of the program to 'time' seconds. Fractional numbers are allowed. 106 | Time in which the OS assigns the processor to other tasks is not counted. 107 | If this limit is exceeded, the program is killed (after *--extra-time*, if set). 108 | 109 | *-w, --wall-time=*'time':: 110 | Limit wall-clock time to 'time' seconds. Fractional values are allowed. 111 | This clock measures the time from the start of the program to its exit, 112 | so it does not stop when the program has lost the CPU or when it is waiting 113 | for an external event. 
We recommend to use *--time* as the main limit, 114 | but set *--wall-time* to a much higher value as a precaution against 115 | sleeping programs. 116 | If this limit is exceeded, the program is killed. 117 | 118 | *-x, --extra-time=*'time':: 119 | When the *--time* limit is exceeded, do not kill the program immediately, 120 | but wait until *--extra-time* seconds elapse since the start of the program. 121 | This allows one to report the real execution time, even if it exceeds the limit 122 | slightly. Fractional numbers are allowed. 123 | 124 | *-k, --stack=*'size':: 125 | Limit process stack to 'size' kilobytes. By default, the whole address 126 | space is available for the stack, but it is subject to the *--mem* limit. 127 | If this limit is exceeded, the program receives the SIGSEGV signal. 128 | 129 | *-n, --open-files=*'max':: 130 | Limit number of open files to 'max'. The default value is 64. Setting this 131 | option to 0 will result in unlimited open files. 132 | If this limit is reached, system calls creating file descriptors fail 133 | with error EMFILE. 134 | 135 | *-f, --fsize=*'size':: 136 | Limit size of each file created (or modified) by the program to 'size' kilobytes. 137 | In most cases, it is better to restrict overall disk usage by a disk quota 138 | (see below). This option can help in cases when quotas are not enabled 139 | on the underlying filesystem. 140 | If this limit is reached, system calls expanding files fail with error 141 | EFBIG and the program receives the SIGXFSZ signal. 142 | 143 | *-q, --quota=*'blocks'*,*'inodes':: 144 | Set disk quota to a given number of blocks and inodes. This requires the 145 | filesystem to be mounted with support for quotas. Unlike other options, 146 | this one must be given to *isolate --init*. Please note that this 147 | currently works only on the ext family of filesystems (other filesystems 148 | use other interfaces for setting quotas). 
149 | If the quota is reached, system calls expanding files fail with error EDQUOT. 150 | 151 | *--core=*'size':: 152 | Limit size of core files created when a process crashes to 'size' kilobytes. 153 | Defaults to zero, meaning that no core files are produced inside the sandbox. 154 | 155 | *-p, --processes*[*=*'max']:: 156 | Permit the program to create up to 'max' processes and/or threads. Please 157 | keep in mind that time and memory limit do not work with multiple processes 158 | unless you enable the control group mode. If 'max' is not given, an arbitrary 159 | number of processes can be run. By default, only one process is permitted. 160 | If this limit is exceeded, system calls creating processes fail with error 161 | EAGAIN. 162 | 163 | ENVIRONMENT RULES 164 | ----------------- 165 | UNIX processes normally inherit all environment variables from their parent. The 166 | sandbox however passes only those variables which are explicitly requested by 167 | environment rules: 168 | 169 | *-E, --env=*'var':: 170 | Inherit the variable 'var' from the parent. 171 | 172 | *-E, --env=*'var'*=*'value':: 173 | Set the variable 'var' to 'value'. When the 'value' is empty, the 174 | variable is removed from the environment. 175 | 176 | *-e, --full-env*:: 177 | Inherit all variables from the parent. 178 | 179 | The rules are applied in the order in which they were given, except for 180 | *--full-env*, which is applied first. 181 | 182 | The list of rules is automatically initialized with *-ELIBC_FATAL_STDERR_=1*. 183 | 184 | DIRECTORY RULES 185 | --------------- 186 | The sandboxed process gets its own filesystem namespace, which contains only subtrees 187 | requested by directory rules: 188 | 189 | *-d, --dir=*'in'*=*'out'[*:*'options']:: 190 | Bind the directory 'out' as seen by the caller to the path 'in' inside the sandbox. 191 | If there already was a directory rule for 'in', it is replaced. 
192 | 193 | *-d, --dir=*'dir'[*:*'options']:: 194 | Bind the directory +/+'dir' to 'dir' inside the sandbox. 195 | If there already was a directory rule for 'dir', it is replaced. 196 | 197 | *-d, --dir=*'in'*=*:: 198 | Remove a directory rule for the path 'in' inside the sandbox. 199 | 200 | By default, all directories are bound read-only and restricted (no devices, 201 | no setuid binaries). This behavior can be modified using the 'options': 202 | 203 | *rw*:: 204 | Allow read-write access. 205 | 206 | *dev*:: 207 | Allow access to character and block devices. 208 | 209 | *noexec*:: 210 | Disallow execution of binaries. 211 | 212 | *maybe*:: 213 | Silently ignore the rule if the directory to be bound does not exist. 214 | 215 | *fs*:: 216 | Instead of binding a directory, mount a device-less filesystem called 'out' at 'in'. 217 | For example, this can be 'proc' or 'sysfs'. 218 | 219 | *tmp*:: 220 | Bind a freshly created temporary directory writeable for the sandbox user. 221 | Accepts no 'out', implies *rw*. 222 | 223 | *norec*:: 224 | Do not bind recursively. Without this option, mount points in the outside 225 | directory tree are automatically propagated to the sandbox. 226 | 227 | Unless *--no-default-dirs* is specified, the default set of directory rules binds +/bin+, 228 | +/dev+ (with devices allowed), +/lib+, +/lib64+ (if it exists), and +/usr+. It also binds 229 | the working directory to +/box+ (read-write), mounts the proc filesystem at +/proc+, and 230 | creates a temporary directory +/tmp+. 231 | 232 | *-D, --no-default-dirs*:: 233 | Do not bind the default set of directories. Care has to be taken to specify 234 | the correct set of rules (using *--dir*) for the executed program to run 235 | correctly. In particular, +/box+ has to be bound. 236 | 237 | The rules are executed in the order in which they are given. Default rules come before 238 | all user rules. When a rule is replaced, it retains the original position 239 | in the order.
This matters when one rule's 'in' is a sub-directory of another 240 | rule's 'in'. For example if you first bind to 'a' and then to 'a/b', it will work as 241 | expected, but a sub-directory 'b' must have existed in the directory bound to 'a' (isolate 242 | never creates subdirectories in bound directories for security reasons). If the 243 | order is 'a/b' before 'a', then the directory bound to 'a/b' becomes invisible 244 | by the later binding on 'a'. 245 | 246 | CONTROL GROUPS 247 | -------------- 248 | Isolate can make use of system control groups provided by the kernel 249 | to constrain programs consisting of multiple processes. Please note 250 | that this feature needs special system setup described in the INSTALLATION 251 | section. 252 | 253 | *--cg*:: 254 | Enable use of control groups. This should be specified with *--init*, 255 | *--run* and *--cleanup*. 256 | 257 | *--cg-mem=*'size':: 258 | Limit total memory usage by the whole control group to 'size' kilobytes. 259 | This should be specified with *--run*. 260 | Effect of reaching this limit depends on circumstances. 261 | If it happens during memory allocation, the allocation can fail or memory 262 | can be over-committed by the kernel. 263 | If it happens when handling a page fault, the whole process is killed 264 | by the OOM killer with the SIGKILL signal. 265 | 266 | *--print-cg-root*:: 267 | Print the root of the control group hierarchy in */sys/* and exit. 268 | This is used by the *isolate-check-environment* script. 269 | 270 | SPECIAL OPTIONS 271 | --------------- 272 | The following options can be useful in special cases. 273 | 274 | *--share-net*:: 275 | By default, isolate creates a new network namespace for its child process. 276 | This namespace contains no network devices except for a per-namespace loopback. 277 | This prevents the program from communicating with the outside world.
If you want 278 | to permit communication, you can use this switch to keep the child process 279 | in parent's network namespace. 280 | 281 | *--inherit-fds*:: 282 | By default, isolate closes all file descriptors passed from its parent 283 | except for descriptors 0, 1, and 2. 284 | This prevents unintentional descriptor leaks. In some cases, passing extra 285 | descriptors to the sandbox can be desirable, so you can use this switch 286 | to make them survive. 287 | 288 | *--tty-hack*:: 289 | Try to handle interactive programs communicating over a tty. 290 | The sandboxed program will run in a separate process group, which will temporarily 291 | become the foreground process group of the terminal. When the program exits, the 292 | process group will be switched back to the caller. Please note that the program 293 | can do many nasty things including (but not limited to) changing terminal settings, 294 | changing the line discipline, and stuffing characters to the terminal's input queue 295 | using the TIOCSTI ioctl. Use with extreme caution. 296 | 297 | *--special-files*:: 298 | By default, Isolate removes all special files (other than regular files 299 | and directories) created inside the sandbox. If you need them, this option disables 300 | that behavior, but you need to carefully check what you open. 301 | 302 | *--as-uid=*'uid', *--as-gid=*'gid':: 303 | Act on behalf of the specified user and group (only if Isolate was invoked by root). 304 | This is used in scenarios where a root-controlled process manages creation of sandboxes 305 | for regular users, usually in conjunction with the *restricted_init* option in 306 | the configuration file. 307 | 308 | META-FILES 309 | ---------- 310 | The meta-file contains miscellaneous meta-information on execution of the 311 | program within the sandbox. It is a textual file consisting of lines 312 | of format 'key'*:*'value'. 
The following keys are defined: 313 | 314 | *cg-mem*:: 315 | When control groups are enabled, this is the total memory use 316 | by the whole control group (in kilobytes). If you use *isolate --run* 317 | multiple times in the same sandbox, the control group retains cached 318 | data from the previous runs, which also contributes to *cg-mem*. 319 | *cg-oom-killed*:: 320 | Present when the program was killed by the out-of-memory killer 321 | (e.g., because it has exceeded the memory limit of its control group). 322 | This is reported only on Linux 4.13 and later. 323 | *csw-forced*:: 324 | Number of context switches forced by the kernel. 325 | *csw-voluntary*:: 326 | Number of context switches caused by the process giving up the CPU 327 | voluntarily. 328 | *exitcode*:: 329 | The program has exited normally with this exit code. 330 | *exitsig*:: 331 | The program has exited after receiving this fatal signal. 332 | *killed*:: 333 | Present when the program was terminated by the sandbox 334 | (e.g., because it has exceeded the time limit). 335 | *max-rss*:: 336 | Maximum resident set size of the process (in kilobytes). 337 | *message*:: 338 | Status message, not intended for machine processing. 339 | E.g., "Time limit exceeded." 340 | *status*:: 341 | Two-letter status code: 342 | * *RE* -- run-time error, i.e., exited with a non-zero exit code 343 | * *SG* -- program died on a signal 344 | * *TO* -- timed out 345 | * *XX* -- internal error of the sandbox 346 | *time*:: 347 | Run time of the program in fractional seconds. 348 | *time-wall*:: 349 | Wall clock time of the program in fractional seconds. 350 | 351 | Please note that not all keys have to be present. 352 | For example, no *status* nor *message* is reported upon normal termination. 353 | 354 | RETURN VALUE 355 | ------------ 356 | When the program inside the sandbox finishes correctly, the sandbox returns 0. 357 | If it finishes incorrectly, it returns 1. 
358 | All other return codes signal an internal error. 359 | 360 | INSTALLATION 361 | ------------ 362 | Isolate depends on several advanced features of the Linux kernel, like different 363 | kinds of namespaces and control groups. These features are available in kernels 364 | of most Linux distributions now, but if you are building your own kernel, you 365 | have to be careful. 366 | 367 | Isolate is designed to run setuid to root. The sub-process inside the sandbox 368 | then switches to a non-privileged user ID (different for each *--box-id*). 369 | The range of UIDs available and several filesystem paths are set in a configuration 370 | file, by default located in /usr/local/etc/isolate. 371 | 372 | For control group mode: 373 | 374 | - Linux supports two incompatible implementations of control groups: cgroup v1 and v2. 375 | This version of Isolate requires v2, which is the default on recent systems. 376 | 377 | - If you are using systemd, you need to start the `isolate.service` (see service files 378 | in the `systemd` directory in Isolate's source tree). It establishes 379 | `isolate.scope` whose cgroup subtree is delegated to Isolate by systemd. 380 | The service runs a simple daemon *isolate-cg-keeper*(8) to keep the scope alive. 381 | 382 | - If you are not using systemd, make sure that Isolate's configuration file 383 | refers to the correct location where you have the cgroup filesystem mounted. 384 | Also make sure that whatever service manager you are using, it does not 385 | interfere with Isolate's use of control groups. 386 | 387 | - Running Isolate in containers is not recommended, since container managers 388 | usually do not delegate control groups properly. Besides, you do not want 389 | to share the machine with other workloads, which would influence measurement 390 | of execution time. If you still want to use containers, you are on your own 391 | and you probably have to make them privileged. 
392 | 393 | - Reporting memory usage requires Linux kernel 5.19 or newer. 394 | 395 | - Since memory limits do not affect swapped-out data, we recommend turning off 396 | swap completely. 397 | 398 | Isolate expects that the root directory "/" is a mount point. When running 399 | isolate inside a chroot, this may not be the case, and isolate may fail with 400 | "Cannot privatize mounts". A workaround for this is to convert the root 401 | directory of the chroot into a mount point using a bind mount, prior to 402 | entering the chroot and running isolate. For example: 403 | 404 | mount --bind /path/to/chroot /path/to/chroot 405 | 406 | It is recommended to have the +fs.protected_hardlinks+ sysctl set to 1 407 | (which is probably the default on modern Linux systems). Otherwise, the user running 408 | the sandbox could trick isolate into changing the owner of unrelated files. 409 | 410 | If you have systemd-coredump installed, please keep in mind that it records core 411 | files even for processes inside the sandbox. As it configures the kernel to deliver 412 | core dumps using a pipe, it is not affected by the *--core* limit. 413 | 414 | REPRODUCIBILITY 415 | --------------- 416 | The reproducibility of results can be improved by tuning some kernel 417 | parameters, listed below. Some of these parameters can be checked using the 418 | program isolate-check-environment. 419 | 420 | * Disable address space randomization: +sysctl kernel.randomize_va_space=0+. 421 | Address space randomization can affect timing, memory usage, and program 422 | behavior. This setting can be made persistent through /etc/sysctl.d/. 423 | 424 | * Disable dynamic CPU frequency scaling. This is done by setting the cpufreq 425 | scaling governor in /sys/devices/system/cpu/cpufreq/*/scaling_governor to +performance+. 426 | (On Intel CPUs, frequency scaling can be controlled by the `intel_pstate` driver, 427 | but it still provides its own +performance+ controller to the cpufreq subsystem.)
428 | 429 | * Consider disabling frequency boosting on CPUs that might support it (this 430 | includes most i3/i5/i7 Intel CPUs and the AMD Zen architecture). This is done 431 | either by writing 1 to /sys/devices/system/cpu/intel_pstate/no_turbo (on Intel CPUs) 432 | or by writing 0 to /sys/devices/system/cpu/cpufreq/boost (other machines). 433 | 434 | * Run evaluations on a single CPU (core). The Linux scheduler has a tendency to randomly 435 | migrate tasks between CPUs, incurring cache migration costs. You can use isolate's 436 | configuration file to pin the process to a specified CPU. 437 | 438 | * If you have a CPU with a mix of different cores (e.g., P-cores and E-cores in certain Intel CPUs), 439 | pin the sandbox to a homogeneous subset of cores. 440 | 441 | * Disable automatic kernel support for transparent huge pages. Both /sys/kernel/mm/transparent_hugepage/enabled 442 | and /sys/kernel/mm/transparent_hugepage/defrag should be set to "madvise" or "never", and 443 | /sys/kernel/mm/transparent_hugepage/khugepaged/defrag to 0. 444 | 445 | * Disable swapping. If you really need swap space and you are using cgroups, 446 | make sure that you have the memsw controller enabled, so that swap space is 447 | properly accounted for. 448 | 449 | See further suggestions in the https://ioi.github.io/checklist/[IOI Technical Checklist]. 450 | 451 | LICENSE 452 | ------- 453 | Isolate was written by Martin Mares and Bernard Blackham. 454 | It can be distributed and used under the terms of the GNU 455 | General Public License version 2 or any later version.
456 | 457 | SEE ALSO 458 | -------- 459 | *isolate-check-environment*(8), *isolate-cg-keeper*(8) 460 | -------------------------------------------------------------------------------- /isolate.c: -------------------------------------------------------------------------------- 1 | /* 2 | * A Process Isolator based on Linux Containers 3 | * 4 | * (c) 2012-2024 Martin Mares 5 | * (c) 2012-2014 Bernard Blackham 6 | */ 7 | 8 | #include "isolate.h" 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | 33 | /* May not be defined in older glibc headers */ 34 | #ifndef MS_PRIVATE 35 | #warning "Working around old glibc: no MS_PRIVATE" 36 | #define MS_PRIVATE (1 << 18) 37 | #endif 38 | #ifndef MS_REC 39 | #warning "Working around old glibc: no MS_REC" 40 | #define MS_REC (1 << 14) 41 | #endif 42 | 43 | /* 44 | * Theory of operation 45 | * 46 | * Generally, we want to run a process inside a namespace/cgroup and watch it 47 | * from the outside. However, the reality is a little bit more complicated as we 48 | * do not want the inside process to become the init process of the PID namespace 49 | * (we want to have all signals properly delivered). 50 | * 51 | * We are running three processes: 52 | * 53 | * - Keeper process (root privileges, parent namespace, parent cgroups) 54 | * - Proxy process (UID/GID of the calling user, init process of the child 55 | * namespace, parent cgroups) 56 | * - Inside process (per-box UID/GID, child namespace, child cgroups) 57 | * 58 | * The proxy process just waits for the inside process to exit and then it passes 59 | * the exit status to the keeper. 
60 | * 61 | * We use two pipes: 62 | * 63 | * - Error pipe for error messages produced by the proxy process and the early 64 | * stages of the inside process (until exec()). Listened to by the keeper. 65 | * - Status pipe for passing the PID of the inside process and its exit status 66 | * from the proxy to the keeper. 67 | */ 68 | 69 | #define TIMER_INTERVAL_US 100000 70 | 71 | static int timeout; /* milliseconds */ 72 | static int wall_timeout; 73 | static int extra_timeout; 74 | int pass_environ; 75 | int verbose; 76 | static int silent; 77 | static int fsize_limit; 78 | static int memory_limit; 79 | static int stack_limit; 80 | static int open_file_limit = 64; 81 | static int core_limit; 82 | int block_quota; 83 | int inode_quota; 84 | static int max_processes = 1; 85 | static char *redir_stdin, *redir_stdout, *redir_stderr; 86 | static int redir_stderr_to_stdout; 87 | static char *set_cwd; 88 | static int share_net; 89 | static int inherit_fds; 90 | static int default_dirs = 1; 91 | static int tty_hack; 92 | static bool special_files; 93 | static bool wait_if_busy; 94 | static int as_uid = -1; 95 | static int as_gid = -1; 96 | 97 | int cg_enable; 98 | int cg_memory_limit; 99 | 100 | int box_id; 101 | static char box_dir[1024]; 102 | static pid_t box_pid; 103 | static pid_t proxy_pid; 104 | 105 | uid_t box_uid; 106 | gid_t box_gid; 107 | uid_t orig_uid; 108 | gid_t orig_gid; 109 | static bool invoked_by_root; 110 | 111 | static int partial_line; 112 | static int cleanup_ownership; 113 | 114 | static struct timespec start_time; 115 | static int ticks_per_sec; 116 | static int total_ms, wall_ms; 117 | static volatile sig_atomic_t timer_tick, interrupt; 118 | 119 | static int error_pipes[2]; 120 | static int write_errors_to_fd; 121 | static int read_errors_from_fd; 122 | 123 | static int status_pipes[2]; 124 | 125 | static int get_wall_time_ms(void); 126 | static int get_run_time_ms(struct rusage *rus); 127 | 128 | /*** Locks ***/ 129 | 130 | /* 131 | * Whenever a 
sandbox is initialized, a lock file is created, which 132 | * records which user owns the sandbox and whether the cgroup mode is used. 133 | * Atempts to use the same sandbox by a different user are refused. 134 | * 135 | * The lock file is locked whenever Isolate runs in that sandbox. 136 | */ 137 | 138 | #define LOCK_MAGIC 0x48736f6c 139 | 140 | struct lock_record { 141 | uint32_t magic; 142 | uint32_t owner_uid; 143 | unsigned char cg_enabled; 144 | unsigned char is_initialized; 145 | unsigned char rfu[2]; 146 | }; 147 | 148 | static int lock_fd = -1; 149 | static struct lock_record lock; 150 | 151 | static void 152 | lock_write(void) 153 | { 154 | int n = pwrite(lock_fd, &lock, sizeof(lock), 0); 155 | if (n != sizeof(lock)) 156 | die("Cannot write lock file: %m"); 157 | } 158 | 159 | static bool 160 | lock_box(bool is_init) 161 | { 162 | if (!dir_exists(cf_lock_root)) 163 | make_dir(cf_lock_root); 164 | 165 | char lock_name[256]; 166 | int name_len = snprintf(lock_name, sizeof(lock_name), "%s/%d", cf_lock_root, box_id); 167 | assert(name_len < (int) sizeof(lock_name)); 168 | 169 | lock_fd = open(lock_name, O_RDWR | (is_init ? O_CREAT : 0), 0666); 170 | if (lock_fd < 0) 171 | { 172 | if (errno == ENOENT) 173 | return false; 174 | die("Cannot open %s: %m", lock_name); 175 | } 176 | 177 | if (flock(lock_fd, LOCK_EX | (wait_if_busy ? 
0 : LOCK_NB)) < 0) 178 | { 179 | if (errno == EWOULDBLOCK) 180 | die("This box is currently in use by another process"); 181 | die("Cannot lock %s: %m", lock_name); 182 | } 183 | 184 | int n = read(lock_fd, &lock, sizeof(lock)); 185 | if (n < 0) 186 | die("Cannot read %s: %m", lock_name); 187 | 188 | if (n > 0) 189 | { 190 | if (n != sizeof(lock) || lock.magic != LOCK_MAGIC) 191 | die("Lock file %s has incompatible format", lock_name); 192 | if (lock.is_initialized && lock.owner_uid != orig_uid && !invoked_by_root) 193 | die("This box belongs to a different user (uid %d)", lock.owner_uid); 194 | if (lock.cg_enabled != cg_enable) 195 | die("This box was initialized with an incompatible control group mode"); 196 | } 197 | 198 | if (is_init) 199 | { 200 | lock.magic = LOCK_MAGIC; 201 | lock.owner_uid = orig_uid; 202 | lock.cg_enabled = cg_enable; 203 | lock.is_initialized = 0; 204 | lock_write(); 205 | return true; 206 | } 207 | else 208 | { 209 | if (n > 0) 210 | { 211 | if (!lock.is_initialized) 212 | die("This box was not initialized properly"); 213 | return true; 214 | } 215 | else 216 | { 217 | // This means that somebody else is just creating the sandbox and we locked it 218 | // between his creation of the lock file and locking it. 219 | return false; 220 | } 221 | } 222 | 223 | // The acquired lock will be automatically released on process exit. 224 | } 225 | 226 | static void 227 | lock_close(void) 228 | { 229 | if (lock_fd >= 0) 230 | { 231 | close(lock_fd); 232 | lock_fd = -1; 233 | } 234 | } 235 | 236 | static void 237 | lock_remove(void) 238 | { 239 | // To avoid race conditions, we must never unlink lock files. 240 | // We just truncate them to zero length. 
241 | assert(lock_fd >= 0); 242 | if (ftruncate(lock_fd, 0) < 0) 243 | die("Cannot truncate lock file: %m"); 244 | close(lock_fd); 245 | lock_fd = -1; 246 | } 247 | 248 | /*** Messages and exits ***/ 249 | 250 | static void 251 | final_stats(struct rusage *rus) 252 | { 253 | total_ms = get_run_time_ms(rus); 254 | wall_ms = get_wall_time_ms(); 255 | 256 | meta_printf("time:%d.%03d\n", total_ms/1000, total_ms%1000); 257 | meta_printf("time-wall:%d.%03d\n", wall_ms/1000, wall_ms%1000); 258 | meta_printf("max-rss:%ld\n", rus->ru_maxrss); 259 | meta_printf("csw-voluntary:%ld\n", rus->ru_nvcsw); 260 | meta_printf("csw-forced:%ld\n", rus->ru_nivcsw); 261 | 262 | cg_stats(); 263 | } 264 | 265 | static void NONRET 266 | box_exit(int rc) 267 | { 268 | if (proxy_pid > 0) 269 | { 270 | if (box_pid > 0) 271 | { 272 | kill(-box_pid, SIGKILL); 273 | kill(box_pid, SIGKILL); 274 | } 275 | if (cg_enable) 276 | { 277 | /* 278 | * In non-CG mode, we must not kill the proxy explicitly. 279 | * This is important, because the proxy could exit before the box 280 | * completes its exit, causing rusage of the box to be lost. 281 | * 282 | * In CG mode, we must kill the proxy, because it is the init 283 | * process of the CG and killing it causes all other processes 284 | * inside the CG to be killed. However, we do not care about 285 | * rusage. 286 | */ 287 | kill(-proxy_pid, SIGKILL); 288 | kill(proxy_pid, SIGKILL); 289 | } 290 | meta_printf("killed:1\n"); 291 | 292 | /* 293 | * The rusage will contain time spent by the proxy and its children (i.e., the box). 294 | * (See comments on killing of the proxy above, though.) 
295 | */ 296 | struct rusage rus; 297 | int p, stat; 298 | do 299 | p = wait4(proxy_pid, &stat, 0, &rus); 300 | while (p < 0 && errno == EINTR); 301 | if (p < 0) 302 | fprintf(stderr, "UGH: Lost track of the process (%m)\n"); 303 | else 304 | final_stats(&rus); 305 | } 306 | 307 | if (tty_hack && isatty(1)) 308 | { 309 | /* 310 | * If stdout is a tty, make us the foreground process group again. 311 | * We do not need it (we ignore SIGTTOU anyway), but programs executed 312 | * after our exit will. 313 | */ 314 | tcsetpgrp(1, getpgrp()); 315 | } 316 | 317 | if (rc < 2 && cleanup_ownership) 318 | chowntree("box", orig_uid, orig_gid, special_files); 319 | 320 | meta_close(); 321 | exit(rc); 322 | } 323 | 324 | static void 325 | flush_line(void) 326 | { 327 | if (partial_line) 328 | fputc('\n', stderr); 329 | partial_line = 0; 330 | } 331 | 332 | /* Report an error of the sandbox itself */ 333 | void NONRET __attribute__((format(printf,1,2))) 334 | die(char *msg, ...) 335 | { 336 | va_list args; 337 | va_start(args, msg); 338 | char buf[1024]; 339 | int n = vsnprintf(buf, sizeof(buf), msg, args); 340 | 341 | // If the child processes are still running, show no mercy. 342 | if (box_pid > 0) 343 | { 344 | kill(-box_pid, SIGKILL); 345 | kill(box_pid, SIGKILL); 346 | } 347 | if (proxy_pid > 0) 348 | { 349 | kill(-proxy_pid, SIGKILL); 350 | kill(proxy_pid, SIGKILL); 351 | } 352 | 353 | if (write_errors_to_fd) 354 | { 355 | // We are inside the box, have to use error pipe for error reporting. 356 | // We hope that the whole error message fits in PIPE_BUF bytes. 
357 | write(write_errors_to_fd, buf, n); 358 | exit(2); 359 | } 360 | 361 | // Otherwise, we in the box keeper process, so we report errors normally 362 | flush_line(); 363 | meta_printf("status:XX\nmessage:%s\n", buf); 364 | fputs(buf, stderr); 365 | fputc('\n', stderr); 366 | box_exit(2); 367 | } 368 | 369 | /* Report an error of the program inside the sandbox */ 370 | void NONRET __attribute__((format(printf,1,2))) 371 | err(char *msg, ...) 372 | { 373 | va_list args; 374 | va_start(args, msg); 375 | flush_line(); 376 | if (msg[0] && msg[1] && msg[2] == ':' && msg[3] == ' ') 377 | { 378 | meta_printf("status:%c%c\n", msg[0], msg[1]); 379 | msg += 4; 380 | } 381 | char buf[1024]; 382 | vsnprintf(buf, sizeof(buf), msg, args); 383 | meta_printf("message:%s\n", buf); 384 | if (!silent) 385 | { 386 | fputs(buf, stderr); 387 | fputc('\n', stderr); 388 | } 389 | box_exit(1); 390 | } 391 | 392 | /* Write a message, but only if in verbose mode */ 393 | void __attribute__((format(printf,1,2))) 394 | msg(char *msg, ...) 395 | { 396 | va_list args; 397 | va_start(args, msg); 398 | if (verbose) 399 | { 400 | int len = strlen(msg); 401 | if (len > 0) 402 | partial_line = (msg[len-1] != '\n'); 403 | vfprintf(stderr, msg, args); 404 | fflush(stderr); 405 | } 406 | va_end(args); 407 | } 408 | 409 | /*** Signal handling in keeper process ***/ 410 | 411 | /* 412 | * Signal handling is tricky. We must set up signal handlers before 413 | * we start the child process (and reset them in the child process). 414 | * Otherwise, there is a short time window where a SIGINT can kill 415 | * us and leave the child process running. 
416 | */ 417 | 418 | struct signal_rule { 419 | int signum; 420 | enum { SIGNAL_IGNORE, SIGNAL_INTERRUPT, SIGNAL_FATAL } action; 421 | }; 422 | 423 | static const struct signal_rule signal_rules[] = { 424 | { SIGHUP, SIGNAL_INTERRUPT }, 425 | { SIGINT, SIGNAL_INTERRUPT }, 426 | { SIGQUIT, SIGNAL_INTERRUPT }, 427 | { SIGILL, SIGNAL_FATAL }, 428 | { SIGABRT, SIGNAL_FATAL }, 429 | { SIGFPE, SIGNAL_FATAL }, 430 | { SIGSEGV, SIGNAL_FATAL }, 431 | { SIGPIPE, SIGNAL_IGNORE }, 432 | { SIGTERM, SIGNAL_INTERRUPT }, 433 | { SIGUSR1, SIGNAL_IGNORE }, 434 | { SIGUSR2, SIGNAL_IGNORE }, 435 | { SIGBUS, SIGNAL_FATAL }, 436 | { SIGTTOU, SIGNAL_IGNORE }, 437 | }; 438 | 439 | static void 440 | signal_alarm(int unused UNUSED) 441 | { 442 | /* Time limit checks are synchronous, so we only schedule them there. */ 443 | timer_tick = 1; 444 | msg("[timer]"); 445 | } 446 | 447 | static void 448 | signal_int(int signum) 449 | { 450 | /* Interrupts (e.g., SIGINT) are synchronous, too. */ 451 | interrupt = signum; 452 | } 453 | 454 | static void 455 | signal_fatal(int signum) 456 | { 457 | /* If we receive SIGSEGV or a similar signal, we try to die gracefully. 
*/ 458 | die("Sandbox keeper received fatal signal %d", signum); 459 | } 460 | 461 | static void 462 | setup_signals(void) 463 | { 464 | struct sigaction sa_int, sa_fatal; 465 | bzero(&sa_int, sizeof(sa_int)); 466 | sa_int.sa_handler = signal_int; 467 | bzero(&sa_fatal, sizeof(sa_fatal)); 468 | sa_fatal.sa_handler = signal_fatal; 469 | 470 | for (int i=0; i < ARRAY_SIZE(signal_rules); i++) 471 | { 472 | const struct signal_rule *sr = &signal_rules[i]; 473 | switch (sr->action) 474 | { 475 | case SIGNAL_IGNORE: 476 | signal(sr->signum, SIG_IGN); 477 | break; 478 | case SIGNAL_INTERRUPT: 479 | sigaction(sr->signum, &sa_int, NULL); 480 | break; 481 | case SIGNAL_FATAL: 482 | sigaction(sr->signum, &sa_fatal, NULL); 483 | break; 484 | default: 485 | die("Invalid signal rule"); 486 | } 487 | } 488 | } 489 | 490 | static void 491 | reset_signals(void) 492 | { 493 | for (int i=0; i < ARRAY_SIZE(signal_rules); i++) 494 | signal(signal_rules[i].signum, SIG_DFL); 495 | } 496 | 497 | /*** The keeper process ***/ 498 | 499 | #define PROC_BUF_SIZE 4096 500 | static int 501 | read_proc_file(char *buf, char *name, int *fdp) 502 | { 503 | int c; 504 | 505 | if (*fdp < 0) 506 | { 507 | snprintf(buf, PROC_BUF_SIZE, "/proc/%d/%s", (int) box_pid, name); 508 | *fdp = open(buf, O_RDONLY); 509 | if (*fdp < 0) 510 | return 0; // This is OK, the process could have finished 511 | } 512 | lseek(*fdp, 0, SEEK_SET); 513 | if ((c = read(*fdp, buf, PROC_BUF_SIZE-1)) < 0) 514 | { 515 | // Even this could fail if the process disappeared since open() 516 | return 0; 517 | } 518 | if (c >= PROC_BUF_SIZE-1) 519 | die("/proc/$pid/%s too long", name); 520 | buf[c] = 0; 521 | return 1; 522 | } 523 | 524 | static int 525 | get_wall_time_ms(void) 526 | { 527 | struct timespec now, wall; 528 | clock_gettime(CLOCK_MONOTONIC, &now); 529 | timespec_sub(&now, &start_time, &wall); 530 | return wall.tv_sec*1000 + wall.tv_nsec/1000000; 531 | } 532 | 533 | static int 534 | get_run_time_ms(struct rusage *rus) 535 | 
{ 536 | if (cg_enable) 537 | return cg_get_run_time_ms(); 538 | 539 | if (rus) 540 | { 541 | struct timeval total; 542 | timeradd(&rus->ru_utime, &rus->ru_stime, &total); 543 | return total.tv_sec*1000 + total.tv_usec/1000; 544 | } 545 | 546 | // It might happen that we do not know the box_pid (see comments in find_box_pid()) 547 | if (!box_pid) 548 | return 0; 549 | 550 | char buf[PROC_BUF_SIZE], *x; 551 | int utime, stime; 552 | static int proc_stat_fd = -1; 553 | 554 | if (!read_proc_file(buf, "stat", &proc_stat_fd)) 555 | return 0; 556 | x = buf; 557 | while (*x && *x != ' ') 558 | x++; 559 | while (*x == ' ') 560 | x++; 561 | if (*x++ != '(') 562 | die("proc stat syntax error 1"); 563 | while (*x && (*x != ')' || x[1] != ' ')) 564 | x++; 565 | while (*x == ')' || *x == ' ') 566 | x++; 567 | if (sscanf(x, "%*c %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d %d %d", &utime, &stime) != 2) 568 | die("proc stat syntax error 2"); 569 | 570 | return (utime + stime) * 1000 / ticks_per_sec; 571 | } 572 | 573 | static void 574 | check_timeout(void) 575 | { 576 | if (wall_timeout) 577 | { 578 | int wall_ms = get_wall_time_ms(); 579 | if (wall_ms > wall_timeout) 580 | err("TO: Time limit exceeded (wall clock)"); 581 | if (verbose > 1) 582 | fprintf(stderr, "[wall time check: %d msec]\n", wall_ms); 583 | } 584 | if (timeout) 585 | { 586 | int ms = get_run_time_ms(NULL); 587 | if (verbose > 1) 588 | fprintf(stderr, "[time check: %d msec]\n", ms); 589 | if (ms > timeout && ms > extra_timeout) 590 | err("TO: Time limit exceeded"); 591 | } 592 | } 593 | 594 | static void 595 | box_keeper(void) 596 | { 597 | read_errors_from_fd = error_pipes[0]; 598 | close(error_pipes[1]); 599 | close(status_pipes[1]); 600 | 601 | clock_gettime(CLOCK_MONOTONIC, &start_time); 602 | ticks_per_sec = sysconf(_SC_CLK_TCK); 603 | if (ticks_per_sec <= 0) 604 | die("Invalid ticks_per_sec!"); 605 | 606 | if (timeout || wall_timeout) 607 | { 608 | struct sigaction sa; 609 | bzero(&sa, sizeof(sa)); 610 | 
sa.sa_handler = signal_alarm; 611 | sigaction(SIGALRM, &sa, NULL); 612 | struct itimerval timer = { 613 | .it_interval = { .tv_usec = TIMER_INTERVAL_US }, 614 | .it_value = { .tv_usec = TIMER_INTERVAL_US }, 615 | }; 616 | setitimer(ITIMER_REAL, &timer, NULL); 617 | } 618 | 619 | for(;;) 620 | { 621 | struct rusage rus; 622 | int stat; 623 | pid_t p; 624 | if (interrupt) 625 | { 626 | meta_printf("exitsig:%d\n", interrupt); 627 | err("SG: Interrupted"); 628 | } 629 | if (timer_tick) 630 | { 631 | check_timeout(); 632 | timer_tick = 0; 633 | } 634 | p = wait4(proxy_pid, &stat, 0, &rus); 635 | if (p < 0) 636 | { 637 | if (errno == EINTR) 638 | continue; 639 | die("wait4: %m"); 640 | } 641 | if (p != proxy_pid) 642 | die("wait4: unknown pid %d exited!", p); 643 | proxy_pid = 0; 644 | 645 | // Check error pipe if there is an internal error passed from inside the box 646 | char interr[1024]; 647 | int n = read(read_errors_from_fd, interr, sizeof(interr) - 1); 648 | if (n > 0) 649 | { 650 | interr[n] = 0; 651 | die("%s", interr); 652 | } 653 | 654 | // Check status pipe if there is an exit status reported by the proxy process 655 | n = read(status_pipes[0], &stat, sizeof(stat)); 656 | if (n != sizeof(stat)) 657 | die("Did not receive exit status from proxy"); 658 | 659 | // At this point, the rusage includes time spent by the proxy's children. 
660 | final_stats(&rus); 661 | if (timeout && total_ms > timeout) 662 | err("TO: Time limit exceeded"); 663 | if (wall_timeout && wall_ms > wall_timeout) 664 | err("TO: Time limit exceeded (wall clock)"); 665 | 666 | if (WIFEXITED(stat)) 667 | { 668 | meta_printf("exitcode:%d\n", WEXITSTATUS(stat)); 669 | if (WEXITSTATUS(stat)) 670 | err("RE: Exited with error status %d", WEXITSTATUS(stat)); 671 | flush_line(); 672 | if (!silent) 673 | { 674 | fprintf(stderr, "OK (%d.%03d sec real, %d.%03d sec wall)\n", 675 | total_ms/1000, total_ms%1000, 676 | wall_ms/1000, wall_ms%1000); 677 | } 678 | box_exit(0); 679 | } 680 | else if (WIFSIGNALED(stat)) 681 | { 682 | meta_printf("exitsig:%d\n", WTERMSIG(stat)); 683 | err("SG: Caught fatal signal %d", WTERMSIG(stat)); 684 | } 685 | else if (WIFSTOPPED(stat)) 686 | { 687 | meta_printf("exitsig:%d\n", WSTOPSIG(stat)); 688 | err("SG: Stopped by signal %d", WSTOPSIG(stat)); 689 | } 690 | else 691 | die("wait4: unknown status %x, giving up!", stat); 692 | } 693 | } 694 | 695 | /*** The process running inside the box ***/ 696 | 697 | static void 698 | setup_root(void) 699 | { 700 | if (mkdir("root", 0750) < 0 && errno != EEXIST) 701 | die("mkdir('root'): %m"); 702 | 703 | /* 704 | * Ensure all mounts are private, not shared. We don't want our mounts 705 | * appearing outside of our namespace. 706 | * (systemd since version 188 mounts filesystems shared by default). 
707 | */ 708 | if (mount(NULL, "/", NULL, MS_REC|MS_PRIVATE, NULL) < 0) 709 | die("Cannot privatize mounts: %m"); 710 | 711 | if (mount("none", "root", "tmpfs", 0, "mode=755") < 0) 712 | die("Cannot mount root ramdisk: %m"); 713 | 714 | apply_dir_rules(default_dirs); 715 | 716 | if (chroot("root") < 0) 717 | die("Chroot failed: %m"); 718 | 719 | if (chdir("root/box") < 0) 720 | die("Cannot change current directory: %m"); 721 | } 722 | 723 | static void 724 | setup_net(void) 725 | { 726 | if (share_net) 727 | return; 728 | 729 | int fd = socket(PF_INET, SOCK_DGRAM, 0); 730 | if (fd < 0) 731 | die("Cannot create PF_INET socket: %m"); 732 | 733 | struct ifreq ifr = { .ifr_name = "lo" }; 734 | if (ioctl(fd, SIOCGIFFLAGS, &ifr) < 0) 735 | die("SIOCGIFFLAGS on 'lo' failed: %m"); 736 | 737 | ifr.ifr_flags |= IFF_UP; 738 | if (ioctl(fd, SIOCSIFFLAGS, &ifr) < 0) 739 | die("SIOCSIFFLAGS on 'lo' failed: %m"); 740 | 741 | close(fd); 742 | } 743 | 744 | static void 745 | setup_credentials(void) 746 | { 747 | if (setresgid(box_gid, box_gid, box_gid) < 0) 748 | die("setresgid: %m"); 749 | if (setgroups(0, NULL) < 0) 750 | die("setgroups: %m"); 751 | if (setresuid(box_uid, box_uid, box_uid) < 0) 752 | die("setresuid: %m"); 753 | setpgrp(); 754 | if (tty_hack && isatty(1)) 755 | { 756 | // If stdout is a tty, make us the foreground process group 757 | signal(SIGTTOU, SIG_IGN); 758 | tcsetpgrp(1, getpgrp()); 759 | signal(SIGTTOU, SIG_DFL); 760 | } 761 | } 762 | 763 | static void 764 | setup_fds(void) 765 | { 766 | if (redir_stdin) 767 | { 768 | close(0); 769 | if (open(redir_stdin, O_RDONLY) != 0) 770 | die("open(\"%s\"): %m", redir_stdin); 771 | } 772 | if (redir_stdout) 773 | { 774 | close(1); 775 | if (open(redir_stdout, O_WRONLY | O_CREAT | O_TRUNC, 0666) != 1) 776 | die("open(\"%s\"): %m", redir_stdout); 777 | } 778 | if (redir_stderr) 779 | { 780 | close(2); 781 | if (open(redir_stderr, O_WRONLY | O_CREAT | O_TRUNC, 0666) != 2) 782 | die("open(\"%s\"): %m", redir_stderr); 783 
| } 784 | if (redir_stderr_to_stdout) 785 | { 786 | if (dup2(1, 2) < 0) 787 | die("Cannot dup stdout to stderr: %m"); 788 | } 789 | } 790 | 791 | static void 792 | setup_rlim(const char *res_name, int res, rlim_t limit) 793 | { 794 | struct rlimit rl = { .rlim_cur = limit, .rlim_max = limit }; 795 | if (setrlimit(res, &rl) < 0) 796 | die("setrlimit(%s, %jd)", res_name, (intmax_t) limit); 797 | } 798 | 799 | static void 800 | setup_rlimits(void) 801 | { 802 | #define RLIM(res, val) setup_rlim("RLIMIT_" #res, RLIMIT_##res, val) 803 | 804 | if (memory_limit) 805 | RLIM(AS, (rlim_t)memory_limit * 1024); 806 | 807 | if (fsize_limit) 808 | RLIM(FSIZE, (rlim_t)fsize_limit * 1024); 809 | 810 | if (open_file_limit) 811 | RLIM(NOFILE, (rlim_t)open_file_limit); 812 | 813 | RLIM(STACK, (stack_limit ? (rlim_t)stack_limit * 1024 : RLIM_INFINITY)); 814 | RLIM(MEMLOCK, 0); 815 | RLIM(CORE, (rlim_t)core_limit * 1024); 816 | 817 | if (max_processes) 818 | RLIM(NPROC, max_processes); 819 | 820 | #undef RLIM 821 | } 822 | 823 | static int 824 | box_inside(char **args) 825 | { 826 | cg_enter(); 827 | setup_root(); 828 | setup_net(); 829 | setup_rlimits(); 830 | setup_credentials(); 831 | setup_fds(); 832 | char **env = setup_environment(); 833 | 834 | if (set_cwd && chdir(set_cwd)) 835 | die("chdir: %m"); 836 | 837 | execve(args[0], args, env); 838 | fprintf(stderr, "execve(\"%s\"): %m\n", args[0]); 839 | exit(127); 840 | } 841 | 842 | /*** Proxy ***/ 843 | 844 | static void 845 | setup_orig_credentials(void) 846 | { 847 | if (setresgid(orig_gid, orig_gid, orig_gid) < 0) 848 | die("setresgid: %m"); 849 | if (setgroups(0, NULL) < 0) 850 | die("setgroups: %m"); 851 | if (setresuid(orig_uid, orig_uid, orig_uid) < 0) 852 | die("setresuid: %m"); 853 | } 854 | 855 | static int 856 | box_proxy(void *arg) 857 | { 858 | char **args = arg; 859 | 860 | write_errors_to_fd = error_pipes[1]; 861 | close(error_pipes[0]); 862 | close(status_pipes[0]); 863 | meta_close(); 864 | lock_close(); 865 | 
reset_signals(); 866 | 867 | pid_t inside_pid = fork(); 868 | if (inside_pid < 0) 869 | die("Cannot run process, fork failed: %m"); 870 | else if (!inside_pid) 871 | { 872 | close(status_pipes[1]); 873 | box_inside(args); 874 | _exit(42); // We should never get here 875 | } 876 | 877 | setup_orig_credentials(); 878 | if (write(status_pipes[1], &inside_pid, sizeof(inside_pid)) != sizeof(inside_pid)) 879 | die("Proxy write to pipe failed: %m"); 880 | 881 | int stat; 882 | pid_t p = waitpid(inside_pid, &stat, 0); 883 | if (p < 0) 884 | die("Proxy waitpid() failed: %m"); 885 | 886 | if (write(status_pipes[1], &stat, sizeof(stat)) != sizeof(stat)) 887 | die("Proxy write to pipe failed: %m"); 888 | 889 | _exit(0); 890 | } 891 | 892 | static void 893 | box_init(void) 894 | { 895 | if (box_id < 0 || box_id >= cf_num_boxes) 896 | die("Sandbox ID out of range (allowed: 0-%d)", cf_num_boxes-1); 897 | box_uid = cf_first_uid + box_id; 898 | box_gid = cf_first_gid + box_id; 899 | 900 | snprintf(box_dir, sizeof(box_dir), "%s/%d", cf_box_root, box_id); 901 | } 902 | 903 | /*** Commands ***/ 904 | 905 | static const char * 906 | self_name(void) 907 | { 908 | return cg_enable ? 
"isolate --cg" : "isolate"; 909 | } 910 | 911 | static void 912 | get_credentials(void) 913 | { 914 | if (geteuid()) 915 | die("Must be started as root"); 916 | if (getegid() && setegid(0) < 0) 917 | die("Cannot switch to root group: %m"); 918 | 919 | orig_uid = getuid(); 920 | orig_gid = getgid(); 921 | invoked_by_root = !orig_uid; 922 | 923 | if (as_uid >= 0 || as_gid >= 0) 924 | { 925 | if (!invoked_by_root) 926 | die("You must be root to use --as-uid or --as-gid"); 927 | if (as_uid < 0 || as_gid < 0) 928 | die("--as-uid and --as-gid must be used either both or none"); 929 | orig_uid = as_uid; 930 | orig_gid = as_gid; 931 | } 932 | } 933 | 934 | static void 935 | do_cleanup(void) 936 | { 937 | if (dir_exists(box_dir)) 938 | { 939 | msg("Removing box directory\n"); 940 | rmtree(box_dir); 941 | } 942 | cg_remove(); 943 | } 944 | 945 | static void 946 | init(void) 947 | { 948 | if (cf_restricted_init && !invoked_by_root) 949 | die("New sandboxes can be created only by root"); 950 | 951 | lock_box(true); 952 | 953 | do_cleanup(); 954 | 955 | msg("Preparing sandbox\n"); 956 | make_dir(box_dir); 957 | if (chdir(box_dir) < 0) 958 | die("chdir(%s): %m", box_dir); 959 | if (mkdir("box", 0700) < 0) 960 | die("Cannot create box: %m"); 961 | if (chown("box", orig_uid, orig_gid) < 0) 962 | die("Cannot chown box: %m"); 963 | 964 | cg_create(); 965 | set_quota(); 966 | 967 | lock.is_initialized = 1; 968 | lock_write(); 969 | 970 | puts(box_dir); 971 | } 972 | 973 | static void 974 | cleanup(void) 975 | { 976 | if (!lock_box(false)) 977 | msg("Nothing to do -- box did not exist\n"); 978 | else 979 | { 980 | msg("Deleting sandbox\n"); 981 | do_cleanup(); 982 | lock_remove(); 983 | } 984 | } 985 | 986 | static void 987 | setup_pipe(int *fds, int nonblocking) 988 | { 989 | if (pipe(fds) < 0) 990 | die("pipe: %m"); 991 | for (int i=0; i<2; i++) 992 | if (fcntl(fds[i], F_SETFD, fcntl(fds[i], F_GETFD) | FD_CLOEXEC) < 0 || 993 | nonblocking && fcntl(fds[i], F_SETFL, fcntl(fds[i], 
F_GETFL) | O_NONBLOCK) < 0) 994 | die("fcntl on pipe: %m"); 995 | } 996 | 997 | /* Set box_pid to the first PID listed in the children file of proxy_pid under /proc. box_pid is left untouched when that file cannot be opened or holds no PID; the function dies when a second PID is listed. */ static void 998 | find_box_pid(void) 999 | { 1000 | /* 1001 | * The box keeper process wants to poll status of the inside process, 1002 | * so it needs to know the box_pid. However, it is not easy to obtain: 1003 | * we got the PID from the proxy, but it is local to the PID namespace. 1004 | * Instead, we ask /proc to enumerate the children of the proxy. 1005 | * 1006 | * CAVEAT: The timing is tricky. We know that the inside process was 1007 | * already started (passing the PID from the proxy to us guarantees it), 1008 | * but it might already have exited and be reaped by the proxy. Therefore 1009 | * it is correct if we fail to find anything. 1010 | */ 1011 | 1012 | char namebuf[256]; 1013 | snprintf(namebuf, sizeof(namebuf), "/proc/%d/task/%d/children", (int) proxy_pid, (int) proxy_pid); 1014 | FILE *f = fopen(namebuf, "r"); 1015 | if (!f) 1016 | return; 1017 | 1018 | int child; 1019 | if (fscanf(f, "%d", &child) != 1) 1020 | { 1021 | fclose(f); 1022 | return; 1023 | } 1024 | box_pid = child; 1025 | 1026 | /* The proxy is expected to have exactly one child, so a second entry is treated as fatal */ if (fscanf(f, "%d", &child) == 1) 1027 | die("Error parsing %s: unexpected children found", namebuf); 1028 | 1029 | fclose(f); 1030 | } 1031 | 1032 | /* The run command. Requires an already initialized box (dies when lock_box(false) returns zero), changes into box_dir, optionally closes file descriptors, hands the "box" tree to box_uid and box_gid, creates error_pipes and status_pipes, calls setup_signals() and cg_setup(), and then clones box_proxy into new IPC, mount and PID namespaces (plus a new network namespace unless share_net is set). After the proxy has reported the PID of the inside process, control passes to box_keeper(). argv is handed to box_proxy as its argument. */ static void 1033 | run(char **argv) 1034 | { 1035 | if (!lock_box(false)) 1036 | die("Box not found, did you run `%s --init'?", self_name()); 1037 | 1038 | if (chdir(box_dir) < 0) 1039 | die("chdir(%s): %m", box_dir); 1040 | 1041 | /* NOTE(review): keep_fd(lock_fd) presumably exempts the lock descriptor from close_all_fds() -- confirm against those helpers */ if (!inherit_fds) 1042 | { 1043 | keep_fd(lock_fd); 1044 | close_all_fds(); 1045 | } 1046 | 1047 | chowntree("box", box_uid, box_gid, false); 1048 | /* NOTE(review): presumably tells the exit path to restore ownership of the box tree -- confirm where cleanup_ownership is read */ cleanup_ownership = 1; 1049 | 1050 | /* error_pipes is created non-blocking, status_pipes blocking */ setup_pipe(error_pipes, 1); 1051 | setup_pipe(status_pipes, 0); 1052 | setup_signals(); 1053 | cg_setup(); 1054 | 1055 | proxy_pid = clone( 1056 | box_proxy, // Function to execute as the body of the new process 1057 | (void*)((uintptr_t)argv & ~(uintptr_t)15), // Pass our stack, aligned to 16-bytes 1058 | SIGCHLD | 
CLONE_NEWIPC | (share_net ? 0 : CLONE_NEWNET) | CLONE_NEWNS | CLONE_NEWPID, 1059 | argv); // Pass the arguments 1060 | if (proxy_pid < 0) 1061 | die("Cannot run proxy, clone failed: %m"); 1062 | if (!proxy_pid) 1063 | die("Cannot run proxy, clone returned 0"); 1064 | 1065 | /* Read from the status pipe, which was created in blocking mode: the proxy sends the PID of the inside process as numbered inside the new PID namespace. It is only logged below; the PID used from the outside is looked up by find_box_pid(). A short read is treated as a proxy failure. */ pid_t box_pid_inside_ns; 1066 | int n = read(status_pipes[0], &box_pid_inside_ns, sizeof(box_pid_inside_ns)); 1067 | if (n != sizeof(box_pid_inside_ns)) 1068 | die("Proxy failed before it passed box_pid: %m"); 1069 | find_box_pid(); 1070 | msg("Started proxy_pid=%d box_pid=%d box_pid_inside_ns=%d\n", (int) proxy_pid, (int) box_pid, (int) box_pid_inside_ns); 1071 | 1072 | box_keeper(); 1073 | } 1074 | 1075 | /* Print the program version and the copyright line to stdout. The line with build date and Git commit is printed only when both BUILD_DATE and BUILD_COMMIT are defined at compile time. */ static void 1076 | show_version(void) 1077 | { 1078 | printf("The process isolator " ISOLATE_VERSION "\n"); 1079 | printf("(c) 2012--" ISOLATE_YEAR " Martin Mares and Bernard Blackham\n"); 1080 | #if defined(BUILD_DATE) && defined(BUILD_COMMIT) 1081 | printf("Built on " BUILD_DATE " from Git commit " BUILD_COMMIT "\n"); 1082 | #endif 1083 | } 1084 | 1085 | /*** Options ***/ 1086 | 1087 | static void __attribute__((format(printf,1,2))) 1088 | usage(const char *msg, ...) 
1089 | { 1090 | if (msg != NULL) 1091 | { 1092 | va_list args; 1093 | va_start(args, msg); 1094 | vfprintf(stderr, msg, args); 1095 | va_end(args); 1096 | } 1097 | printf("\ 1098 | Usage: isolate [] \n\ 1099 | \n\ 1100 | Options:\n\ 1101 | --as-uid=\tPerform action on behalf of a given user (requires root)\n\ 1102 | --as-gid=\tPerform action on behalf of a given group (requires root)\n\ 1103 | -b, --box-id=\tWhen multiple sandboxes are used in parallel, each must get a unique ID\n\ 1104 | --cg\t\tEnable use of control groups\n\ 1105 | --cg-mem=\tLimit memory usage of the control group to KB\n\ 1106 | -c, --chdir=\tChange directory to before executing the program\n\ 1107 | --core=\tLimit core files to KB (default: 0)\n\ 1108 | -d, --dir=\t\tMake a directory visible inside the sandbox\n\ 1109 | --dir==\tMake a directory outside visible as inside\n\ 1110 | --dir==\t\tDelete a previously defined directory rule (even a default one)\n\ 1111 | --dir=...:\tSpecify options for a rule:\n\ 1112 | \t\t\t\tdev\tAllow access to block/char devices\n\ 1113 | \t\t\t\tfs\tMount a filesystem (e.g., --dir=/proc:proc:fs)\n\ 1114 | \t\t\t\tmaybe\tSkip the rule if does not exist\n\ 1115 | \t\t\t\tnoexec\tDo not allow execution of binaries\n\ 1116 | \t\t\t\tnorec\tDo not bind the directory recursively\n\ 1117 | \t\t\t\trw\tAllow read-write access\n\ 1118 | \t\t\t\ttmp\tCreate as a temporary directory (implies rw)\n\ 1119 | -D, --no-default-dirs\tDo not add default directory rules\n\ 1120 | -f, --fsize=\tMax size (in KB) of files that can be created\n\ 1121 | -E, --env=\t\tInherit the environment variable from the parent process\n\ 1122 | -E, --env==\tSet the environment variable to ; unset it if is empty\n\ 1123 | -x, --extra-time=