├── .gitignore ├── Makefile ├── README.md ├── TODO ├── config.h ├── fix_mod.sh ├── isolate.1.txt └── isolate.c /.gitignore: -------------------------------------------------------------------------------- 1 | docbook-xsl.css 2 | isolate 3 | isolate.1 4 | isolate.1.html 5 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Isolate 2 | # (c) 2015 Martin Mares 3 | 4 | all: isolate isolate.1 isolate.1.html 5 | 6 | CC=gcc 7 | CFLAGS=-std=gnu99 -Wall -Wextra -Wno-parentheses -Wno-unused-result -Wno-missing-field-initializers 8 | 9 | VERSION=1.1 10 | YEAR=2015 11 | BUILD_DATE:=$(shell date '+%Y-%m-%d') 12 | BUILD_COMMIT:=$(shell if git rev-parse >/dev/null 2>/dev/null ; then git describe --always ; else echo '' ; fi) 13 | CFLAGS += -DVERSION='"$(VERSION)"' -DYEAR='"$(YEAR)"' -DBUILD_DATE='"$(BUILD_DATE)"' -DBUILD_COMMIT='"$(BUILD_COMMIT)"' 14 | 15 | PREFIX = $(DESTDIR)/usr/local 16 | BINDIR = $(PREFIX)/bin 17 | DATAROOTDIR = $(PREFIX)/share 18 | DATADIR = $(DATAROOTDIR) 19 | MANDIR = $(DATADIR)/man 20 | MAN1DIR = $(MANDIR)/man1 21 | 22 | isolate: isolate.c config.h # config.h only triggers a rebuild; just the first prerequisite (isolate.c) is compiled 23 | $(CC) $(CFLAGS) -o $@ $< 24 | 25 | isolate.1: isolate.1.txt 26 | a2x -f manpage -D . $< 27 | 28 | # The dependency on isolate.1 is there to serialize both calls of asciidoc, 29 | # which does not name temporary files safely. 30 | isolate.1.html: isolate.1.txt isolate.1 31 | a2x -f xhtml -D . 
$< 32 | 33 | clean: 34 | rm -f isolate isolate.1 isolate.1.html 35 | rm -f docbook-xsl.css 36 | 37 | install: isolate 38 | install -D $< $(BINDIR)/$< 39 | chmod u+s $(BINDIR)/$< 40 | 41 | install-doc: isolate.1 42 | install -D $< $(MAN1DIR)/$< 43 | 44 | .PHONY: all clean install install-doc 45 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Important note 2 | ============== 3 | 4 | This repository is only here for historical reasons. It is not used nor updated. 5 | The authoritative version of the isolate sandbox, also used in CMS, is 6 | [ioi/isolate](https://github.com/ioi/isolate). 7 | 8 | isolate 9 | ======= 10 | 11 | Isolate is a sandbox built to safely run untrusted executables, 12 | offering them a limited-access environment and preventing them from 13 | affecting the host system. It takes advantage of features specific to 14 | the Linux kernel, like namespaces and control groups. 15 | 16 | Isolate was developed by Martin Mareš () and Bernard Blackham 17 | (), who still maintain it. Several other people 18 | contributed patches for features and bug fixes (see Git history for a list). 19 | Thanks! 20 | 21 | Originally, Isolate was a part of the [Moe Contest Environment](http://www.ucw.cz/moe/), 22 | but it evolved to a separate project used by different 23 | contest systems, most prominently [CMS](https://github.com/cms-dev/cms). 24 | It now lives at [GitHub](https://github.com/ioi/isolate), 25 | where you can submit bug reports and feature requests. 26 | 27 | If you are interested in more details, please read Martin's 28 | and Bernard's [paper](http://mj.ucw.cz/papers/isolate.pdf) presented 29 | at the IOI Conference. Also, Isolate's [manual page](http://www.ucw.cz/moe/isolate.1.html) 30 | is available online. 
31 | -------------------------------------------------------------------------------- /TODO: -------------------------------------------------------------------------------- 1 | Installation 2 | Test: ptrace self 3 | Test: SIGSTOP 4 | Test: ping-pong timing attacks 5 | Test: big static memory 6 | Examine the use of taskstats for measuring memory 7 | Doc: mount -t cgroup none -o cpuset,cpuacct,memory /sys/fs/cgroup 8 | Switch license to GPL2/GPL3 9 | -------------------------------------------------------------------------------- /config.h: -------------------------------------------------------------------------------- 1 | #ifndef __ISOLATE_CONFIG_H__ 2 | #define __ISOLATE_CONFIG_H__ 3 | 4 | /* A directory under which all sandboxes are created. */ 5 | #define CONFIG_ISOLATE_BOX_DIR "/tmp/box" 6 | 7 | /* Range of UIDs and GIDs reserved for use by the sandboxes. */ 8 | #define CONFIG_ISOLATE_FIRST_UID 60000 9 | #define CONFIG_ISOLATE_FIRST_GID 60000 10 | #define CONFIG_ISOLATE_NUM_BOXES 100 11 | 12 | /* Root of the cgroup hierarchy. 
*/ 13 | #define CONFIG_ISOLATE_CGROUP_ROOT "/sys/fs/cgroup" 14 | 15 | #endif /* __ISOLATE_CONFIG_H__ */ 16 | -------------------------------------------------------------------------------- /fix_mod.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Remember to execute as root :-) 4 | 5 | chown root isolate 6 | chmod u+s isolate 7 | 8 | -------------------------------------------------------------------------------- /isolate.1.txt: -------------------------------------------------------------------------------- 1 | ISOLATE(1) 2 | ========== 3 | 4 | NAME 5 | ---- 6 | isolate - Isolate a process using Linux Containers 7 | 8 | SYNOPSIS 9 | -------- 10 | *isolate* 'options' *--init* 11 | 12 | *isolate* 'options' *--run* +--+ 'program' 'arguments' 13 | 14 | *isolate* 'options' *--cleanup* 15 | 16 | DESCRIPTION 17 | ----------- 18 | Run 'program' within a sandbox, so that it cannot communicate with the 19 | outside world and its resource consumption is limited. This can be used 20 | for example in a programming contest to run untrusted programs submitted 21 | by contestants in a controlled environment. 22 | 23 | The sandbox is used in the following way: 24 | 25 | * Run *isolate --init*, which initializes the sandbox, creates its working directory and 26 | prints its name to the standard output. 27 | 28 | * Populate the directory with the executable file of the program and its 29 | input files. 30 | 31 | * Call *isolate --run* to run the program. A single line describing the 32 | status of the program is written to the standard error stream. 33 | 34 | * Fetch the output of the program from the directory. 35 | 36 | * Run *isolate --cleanup* to remove temporary files. 37 | 38 | Please note that by default, the program is not allowed to start multiple 39 | processes or threads. If you need that, turn on the control group mode 40 | (see below). 
41 | 42 | OPTIONS 43 | ------- 44 | *-M, --meta=*'file':: 45 | Output meta-data on the execution of the program to a given file. 46 | See below for syntax of the meta-files. 47 | 48 | *-m, --mem=*'size':: 49 | Limit address space of the program to 'size' kilobytes. If more processes 50 | are allowed, this applies to each of them separately. 51 | 52 | *-t, --time=*'time':: 53 | Limit run time of the program to 'time' seconds. Fractional numbers are allowed. 54 | Time in which the OS assigns the processor to different tasks is not counted. 55 | 56 | *-w, --wall-time=*'time':: 57 | Limit wall-clock time to 'time' seconds. Fractional values are allowed. 58 | This clock measures the time from the start of the program to its exit, 59 | so it does not stop when the program has lost the CPU or when it is waiting 60 | for an external event. We recommend using *--time* as the main limit, 61 | but setting *--wall-time* to a much higher value as a precaution against 62 | sleeping programs. 63 | 64 | *-x, --extra-time=*'time':: 65 | When a time limit is exceeded, wait for extra 'time' seconds before 66 | killing the program. This has the advantage that the real execution time 67 | is reported, even though it slightly exceeds the limit. Fractional 68 | numbers are again allowed. 69 | 70 | *-b, --box-id=*'id':: 71 | When you run multiple sandboxes in parallel, you have to assign a unique 72 | ID to each of them with this option. See the discussion on UIDs in the INSTALLATION 73 | section. The ID defaults to 0. 74 | 75 | *-k, --stack=*'size':: 76 | Limit process stack to 'size' kilobytes. By default, the whole address 77 | space is available for the stack, but it is subject to the *--mem* limit. 78 | 79 | *-f, --fsize=*'size':: 80 | Limit size of files created (or modified) by the program to 'size' kilobytes. 81 | In most cases, it is better to restrict overall disk usage by a disk quota 82 | (see below). 
This option can help in cases when quotas are not enabled 83 | on the underlying filesystem. 84 | 85 | *-q, --quota=*'blocks'*,*'inodes':: 86 | Set disk quota to a given number of blocks and inodes. This requires the 87 | filesystem to be mounted with support for quotas. 88 | 89 | *-i, --stdin=*'file':: 90 | Redirect standard input from 'file'. The 'file' has to be accessible 91 | inside the sandbox. 92 | 93 | *-o, --stdout=*'file':: 94 | Redirect standard output to 'file'. The 'file' has to be accessible 95 | inside the sandbox. 96 | 97 | *-r, --stderr=*'file':: 98 | Redirect standard error output to 'file'. The 'file' has to be accessible 99 | inside the sandbox. 100 | 101 | *-c, --chdir=*'dir':: 102 | Change directory to 'dir' before executing the program. This path must be 103 | relative to the root of the sandbox. 104 | 105 | *-p, --processes*[*=*'max']:: 106 | Permit the program to create up to 'max' processes and/or threads. Please 107 | keep in mind that time and memory limit do not work with multiple processes 108 | unless you enable the control group mode. If 'max' is not given, an arbitrary 109 | number of processes can be run. By default, only one process is permitted. 110 | 111 | *--share-net*:: 112 | By default, isolate creates a new network namespace for its child process. 113 | This namespace contains no network devices except for a per-namespace loopback. 114 | This prevents the program from communicating with the outside world. If you want 115 | to permit communication, you can use this switch to keep the child process 116 | in parent's network namespace. 117 | 118 | *-v, --verbose*:: 119 | Tell the sandbox manager to be verbose and report on what is going on. 120 | Using *-v* multiple times produces even more jabber. 121 | 122 | ENVIRONMENT RULES 123 | ----------------- 124 | UNIX processes normally inherit all environment variables from their parent. 
The 125 | sandbox however passes only those variables which are explicitly requested by 126 | environment rules: 127 | 128 | *-E, --env=*'var':: 129 | Inherit the variable 'var' from the parent. 130 | 131 | *-E, --env=*'var'*=*'value':: 132 | Set the variable 'var' to 'value'. When the 'value' is empty, the 133 | variable is removed from the environment. 134 | 135 | *-e, --full-env*:: 136 | Inherit all variables from the parent. 137 | 138 | The rules are applied in the order in which they were given, except for 139 | *--full-env*, which is applied first. 140 | 141 | The list of rules is automatically initialized with *-ELIBC_FATAL_STDERR_=1*. 142 | 143 | DIRECTORY RULES 144 | --------------- 145 | The sandboxed process gets its own filesystem namespace, which contains only subtrees 146 | requested by directory rules: 147 | 148 | *-d, --dir=*'in'*=*'out'[*:*'options']:: 149 | Bind the directory 'out' as seen by the caller to the path 'in' inside the sandbox. 150 | If there already was a directory rule for 'in', it is replaced. 151 | 152 | *-d, --dir=*'dir'[*:*'options']:: 153 | Bind the directory +/+'dir' to 'dir' inside the sandbox. 154 | If there already was a directory rule for 'dir', it is replaced. 155 | 156 | *-d, --dir=*'in'*=*:: 157 | Remove a directory rule for the path 'in' inside the sandbox. 158 | 159 | By default, all directories are bound read-only and restricted (no devices, 160 | no setuid binaries). This behavior can be modified using the 'options': 161 | 162 | *rw*:: 163 | Allow read-write access. 164 | 165 | *dev*:: 166 | Allow access to character and block devices. 167 | 168 | *noexec*:: 169 | Disallow execution of binaries. 170 | 171 | *maybe*:: 172 | Silently ignore the rule if the directory to be bound does not exist. 173 | 174 | *fs*:: 175 | Instead of binding a directory, mount a device-less filesystem of type 'out' at the path 'in'. 176 | For example, 'out' can be 'proc' or 'sysfs'. 
177 | 178 | The default set of directory rules binds +/bin+, +/dev+ (with devices allowed), +/lib+, 179 | +/lib64+ (if it exists), and +/usr+. It also binds the working directory to +/box+ (read-write) 180 | and mounts the proc filesystem at +/proc+. 181 | 182 | CONTROL GROUPS 183 | -------------- 184 | Isolate can make use of system control groups provided by the kernel 185 | to constrain programs consisting of multiple processes. Please note 186 | that this feature needs special system setup described in the INSTALLATION 187 | section. 188 | 189 | *--cg*:: 190 | Enable use of control groups. This should be specified with *--init*, 191 | *--run* and *--cleanup*. 192 | 193 | *--cg-mem=*'size':: 194 | Limit total memory usage by the whole control group to 'size' kilobytes. 195 | This should be specified with *--run*. 196 | 197 | *--cg-timing*:: 198 | Use control groups for timing, so that the *--time* switch affects the 199 | total run time of all processes and threads in the control group. 200 | This should be specified with *--run*. 201 | 202 | META-FILES 203 | ---------- 204 | The meta-file contains miscellaneous meta-information on execution of the 205 | program within the sandbox. It is a textual file consisting of lines 206 | of format 'key'*:*'value'. The following keys are defined: 207 | 208 | *cg-mem*:: 209 | When control groups are enabled, this is the total memory use 210 | by the whole control group (in kilobytes). 211 | *csw-forced*:: 212 | Number of context switches forced by the kernel. 213 | *csw-voluntary*:: 214 | Number of context switches caused by the process giving up the CPU 215 | voluntarily. 216 | *exitcode*:: 217 | The program has exited normally with this exit code. 218 | *exitsig*:: 219 | The program has exited after receiving this fatal signal. 220 | *killed*:: 221 | Present when the program was terminated by the sandbox 222 | (e.g., because it has exceeded the time limit). 
223 | *max-rss*:: 224 | Maximum resident set size of the process (in kilobytes). 225 | *message*:: 226 | Status message, not intended for machine processing. 227 | E.g., "Time limit exceeded." 228 | *status*:: 229 | Two-letter status code: 230 | * *RE* -- run-time error, i.e., exited with a non-zero exit code 231 | * *SG* -- program died on a signal 232 | * *TO* -- timed out 233 | * *XX* -- internal error of the sandbox 234 | *time*:: 235 | Run time of the program in fractional seconds. 236 | *time-wall*:: 237 | Wall clock time of the program in fractional seconds. 238 | 239 | RETURN VALUE 240 | ------------ 241 | When the program inside the sandbox finishes correctly, the sandbox returns 0. 242 | If it finishes incorrectly, it returns 1. 243 | All other return codes signal an internal error. 244 | 245 | INSTALLATION 246 | ------------ 247 | Isolate depends on several advanced features of the Linux kernel. Please 248 | make sure that your kernel supports 249 | PID namespaces (+CONFIG_PID_NS+), 250 | IPC namespaces (+CONFIG_IPC_NS+), and 251 | network namespaces (+CONFIG_NET_NS+). 252 | If you want to use control groups, you need 253 | the cpusets (+CONFIG_CPUSETS+), 254 | CPU accounting controller (+CONFIG_CGROUP_CPUACCT+), and 255 | memory resource controller (+CONFIG_MEMCG+). If your machine has swap enabled, 256 | you should also enable the swap controller (+CONFIG_MEMCG_SWAP+). 257 | 258 | Debian 7.x and newer require enabling the memory and swap cgroup controllers by 259 | adding the parameters "cgroup_enable=memory swapaccount=1" to the kernel 260 | command-line, which can be set using GRUB_CMDLINE_LINUX_DEFAULT in 261 | /etc/default/grub. 262 | 263 | Isolate is designed to run setuid to root. The sub-process inside the sandbox 264 | then switches to a non-privileged user ID (different for each *--box-id*). 
265 | The range of UIDs available and several filesystem paths are embedded in the 266 | isolate's binary during compilation; please see +config.h+ in the source 267 | tree for description. 268 | 269 | Before you run isolate with control groups, you need to ensure that the cgroup 270 | filesystem is enabled and mounted. Most modern Linux distributions already 271 | provide cgroup support through a tmpfs mounted at /sys/fs/cgroup, with 272 | individual controllers mounted within subdirectories. 273 | 274 | REPRODUCIBILITY 275 | --------------- 276 | 277 | The reproducibility of results can be improved by tuning some kernel 278 | parameters: 279 | 280 | * Disable address space randomization: +sysctl kernel.randomize_va_space=0+. 281 | Address space randomization can affect timing, memory usage, and program 282 | behavior. This setting can be made persistent through /etc/sysctl.d/. 283 | 284 | * Disable dynamic CPU frequency scaling. This requires setting the cpufreq 285 | scaling governor to +performance+. The process for doing this varies between 286 | distributions. 287 | 288 | LICENSE 289 | ------- 290 | Isolate was written by Martin Mares and Bernard Blackham. 291 | It can be distributed and used under the terms of the GNU 292 | General Public License version 2. 
293 | -------------------------------------------------------------------------------- /isolate.c: -------------------------------------------------------------------------------- 1 | /* 2 | * A Process Isolator based on Linux Containers 3 | * 4 | * (c) 2012-2015 Martin Mares 5 | * (c) 2012-2014 Bernard Blackham 6 | */ 7 | 8 | #define _GNU_SOURCE 9 | 10 | #include "config.h" 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | 37 | /* May not be defined in older glibc headers */ 38 | #ifndef MS_PRIVATE 39 | #warning "Working around old glibc: no MS_PRIVATE" 40 | #define MS_PRIVATE (1 << 18) 41 | #endif 42 | #ifndef MS_REC 43 | #warning "Working around old glibc: no MS_REC" 44 | #define MS_REC (1 << 14) 45 | #endif 46 | 47 | #define NONRET __attribute__((noreturn)) 48 | #define UNUSED __attribute__((unused)) 49 | #define ARRAY_SIZE(a) (int)(sizeof(a)/sizeof(a[0])) 50 | 51 | static int timeout; /* milliseconds */ 52 | static int wall_timeout; 53 | static int extra_timeout; 54 | static int pass_environ; 55 | static int verbose; 56 | static int fsize_limit; 57 | static int memory_limit; 58 | static int stack_limit; 59 | static int block_quota; 60 | static int inode_quota; 61 | static int max_processes = 1; 62 | static char *redir_stdin, *redir_stdout, *redir_stderr; 63 | static char *set_cwd; 64 | static int share_net; 65 | 66 | static int cg_enable; 67 | static int cg_memory_limit; 68 | static int cg_timing; 69 | 70 | static int box_id; 71 | static char box_dir[1024]; 72 | static pid_t box_pid; 73 | 74 | static uid_t box_uid; 75 | static gid_t box_gid; 76 | static uid_t orig_uid; 77 | static gid_t orig_gid; 78 | 79 | static int partial_line; 80 | static int cleanup_ownership; 81 | 
82 | static struct timeval start_time; 83 | static int ticks_per_sec; 84 | static int total_ms, wall_ms; 85 | static volatile sig_atomic_t timer_tick; 86 | 87 | static int error_pipes[2]; 88 | static int write_errors_to_fd; 89 | static int read_errors_from_fd; 90 | 91 | static void die(char *msg, ...) NONRET; 92 | static void cg_stats(void); 93 | static int get_wall_time_ms(void); 94 | static int get_run_time_ms(struct rusage *rus); 95 | 96 | static void chowntree(char *path, uid_t uid, gid_t gid); 97 | 98 | /*** Meta-files ***/ 99 | 100 | static FILE *metafile; 101 | 102 | static void 103 | meta_open(const char *name) 104 | { 105 | if (!strcmp(name, "-")) 106 | { 107 | metafile = stdout; 108 | return; 109 | } 110 | if (setfsuid(getuid()) < 0) 111 | die("Failed to switch FS UID: %m"); 112 | metafile = fopen(name, "w"); 113 | if (setfsuid(geteuid()) < 0) 114 | die("Failed to switch FS UID back: %m"); 115 | if (!metafile) 116 | die("Failed to open metafile '%s'",name); 117 | } 118 | 119 | static void 120 | meta_close(void) 121 | { 122 | if (metafile && metafile != stdout) 123 | fclose(metafile); 124 | } 125 | 126 | static void __attribute__((format(printf,1,2))) 127 | meta_printf(const char *fmt, ...) 
128 | { 129 | if (!metafile) 130 | return; 131 | 132 | va_list args; 133 | va_start(args, fmt); 134 | vfprintf(metafile, fmt, args); 135 | va_end(args); 136 | } 137 | 138 | static void 139 | final_stats(struct rusage *rus) 140 | { 141 | total_ms = get_run_time_ms(rus); 142 | wall_ms = get_wall_time_ms(); 143 | 144 | meta_printf("time:%d.%03d\n", total_ms/1000, total_ms%1000); 145 | meta_printf("time-wall:%d.%03d\n", wall_ms/1000, wall_ms%1000); 146 | meta_printf("max-rss:%ld\n", rus->ru_maxrss); 147 | meta_printf("csw-voluntary:%ld\n", rus->ru_nvcsw); 148 | meta_printf("csw-forced:%ld\n", rus->ru_nivcsw); 149 | 150 | cg_stats(); 151 | } 152 | 153 | /*** Messages and exits ***/ 154 | 155 | static void NONRET 156 | box_exit(int rc) 157 | { 158 | if (box_pid > 0) 159 | { 160 | kill(-box_pid, SIGKILL); 161 | kill(box_pid, SIGKILL); 162 | meta_printf("killed:1\n"); 163 | 164 | struct rusage rus; 165 | int p, stat; 166 | do 167 | p = wait4(box_pid, &stat, 0, &rus); 168 | while (p < 0 && errno == EINTR); 169 | if (p < 0) 170 | fprintf(stderr, "UGH: Lost track of the process (%m)\n"); 171 | else 172 | final_stats(&rus); 173 | } 174 | 175 | if (rc < 2 && cleanup_ownership) 176 | chowntree("box", orig_uid, orig_gid); 177 | 178 | meta_close(); 179 | exit(rc); 180 | } 181 | 182 | static void 183 | flush_line(void) 184 | { 185 | if (partial_line) 186 | fputc('\n', stderr); 187 | partial_line = 0; 188 | } 189 | 190 | /* Report an error of the sandbox itself */ 191 | static void NONRET __attribute__((format(printf,1,2))) 192 | die(char *msg, ...) 193 | { 194 | va_list args; 195 | va_start(args, msg); 196 | char buf[1024]; 197 | int n = vsnprintf(buf, sizeof(buf), msg, args); 198 | 199 | if (write_errors_to_fd) 200 | { 201 | // We are inside the box, have to use error pipe for error reporting. 202 | // We hope that the whole error message fits in PIPE_BUF bytes. 
203 | write(write_errors_to_fd, buf, n); 204 | exit(2); 205 | } 206 | 207 | // Otherwise, we in the box keeper process, so we report errors normally 208 | flush_line(); 209 | meta_printf("status:XX\nmessage:%s\n", buf); 210 | fputs(buf, stderr); 211 | fputc('\n', stderr); 212 | box_exit(2); 213 | } 214 | 215 | /* Report an error of the program inside the sandbox */ 216 | static void NONRET __attribute__((format(printf,1,2))) 217 | err(char *msg, ...) 218 | { 219 | va_list args; 220 | va_start(args, msg); 221 | flush_line(); 222 | if (msg[0] && msg[1] && msg[2] == ':' && msg[3] == ' ') 223 | { 224 | meta_printf("status:%c%c\n", msg[0], msg[1]); 225 | msg += 4; 226 | } 227 | char buf[1024]; 228 | vsnprintf(buf, sizeof(buf), msg, args); 229 | meta_printf("message:%s\n", buf); 230 | fputs(buf, stderr); 231 | fputc('\n', stderr); 232 | box_exit(1); 233 | } 234 | 235 | /* Write a message, but only if in verbose mode */ 236 | static void __attribute__((format(printf,1,2))) 237 | msg(char *msg, ...) 
238 | { 239 | va_list args; 240 | va_start(args, msg); 241 | if (verbose) 242 | { 243 | int len = strlen(msg); 244 | if (len > 0) 245 | partial_line = (msg[len-1] != '\n'); 246 | vfprintf(stderr, msg, args); 247 | fflush(stderr); 248 | } 249 | va_end(args); 250 | } 251 | 252 | /*** Utility functions ***/ 253 | 254 | static void * 255 | xmalloc(size_t size) 256 | { 257 | void *p = malloc(size); 258 | if (!p) 259 | die("Out of memory"); 260 | return p; 261 | } 262 | 263 | static char * 264 | xstrdup(char *str) 265 | { 266 | char *p = strdup(str); 267 | if (!p) 268 | die("Out of memory"); 269 | return p; 270 | } 271 | 272 | static int dir_exists(char *path) 273 | { 274 | struct stat st; 275 | return (stat(path, &st) >= 0 && S_ISDIR(st.st_mode)); 276 | } 277 | 278 | static int rmtree_helper(const char *fpath, const struct stat *sb, 279 | int typeflag UNUSED, struct FTW *ftwbuf UNUSED) 280 | { 281 | if (S_ISDIR(sb->st_mode)) 282 | { 283 | if (rmdir(fpath) < 0) 284 | die("Cannot rmdir %s: %m", fpath); 285 | } 286 | else 287 | { 288 | if (unlink(fpath) < 0) 289 | die("Cannot unlink %s: %m", fpath); 290 | } 291 | return FTW_CONTINUE; 292 | } 293 | 294 | static void 295 | rmtree(char *path) 296 | { 297 | nftw(path, rmtree_helper, 32, FTW_MOUNT | FTW_PHYS | FTW_DEPTH); 298 | } 299 | 300 | static uid_t chown_uid; 301 | static gid_t chown_gid; 302 | 303 | static int chowntree_helper(const char *fpath, const struct stat *sb UNUSED, 304 | int typeflag UNUSED, struct FTW *ftwbuf UNUSED) 305 | { 306 | if (lchown(fpath, chown_uid, chown_gid) < 0) 307 | die("Cannot chown %s: %m", fpath); 308 | else 309 | return FTW_CONTINUE; 310 | } 311 | 312 | static void 313 | chowntree(char *path, uid_t uid, gid_t gid) 314 | { 315 | chown_uid = uid; 316 | chown_gid = gid; 317 | nftw(path, chowntree_helper, 32, FTW_MOUNT | FTW_PHYS); 318 | } 319 | 320 | /*** Environment rules ***/ 321 | 322 | struct env_rule { 323 | char *var; // Variable to match 324 | char *val; // ""=clear, NULL=inherit 325 | 
int var_len; 326 | struct env_rule *next; 327 | }; 328 | 329 | static struct env_rule *first_env_rule; 330 | static struct env_rule **last_env_rule = &first_env_rule; 331 | 332 | static struct env_rule default_env_rules[] = { 333 | { "LIBC_FATAL_STDERR_", "1" } 334 | }; 335 | 336 | static int 337 | set_env_action(char *a0) 338 | { 339 | struct env_rule *r = xmalloc(sizeof(*r) + strlen(a0) + 1); 340 | char *a = (char *)(r+1); 341 | strcpy(a, a0); 342 | 343 | char *sep = strchr(a, '='); 344 | if (sep == a) 345 | return 0; 346 | r->var = a; 347 | if (sep) 348 | { 349 | *sep++ = 0; 350 | r->val = sep; 351 | } 352 | else 353 | r->val = NULL; 354 | *last_env_rule = r; 355 | last_env_rule = &r->next; 356 | r->next = NULL; 357 | return 1; 358 | } 359 | 360 | static int 361 | match_env_var(char *env_entry, struct env_rule *r) 362 | { 363 | if (strncmp(env_entry, r->var, r->var_len)) 364 | return 0; 365 | return (env_entry[r->var_len] == '='); 366 | } 367 | 368 | static void 369 | apply_env_rule(char **env, int *env_sizep, struct env_rule *r) 370 | { 371 | // First remove the variable if already set 372 | int pos = 0; 373 | while (pos < *env_sizep && !match_env_var(env[pos], r)) 374 | pos++; 375 | if (pos < *env_sizep) 376 | { 377 | (*env_sizep)--; 378 | env[pos] = env[*env_sizep]; 379 | env[*env_sizep] = NULL; 380 | } 381 | 382 | // What is the new value? 
383 | char *new; 384 | if (r->val) 385 | { 386 | if (!r->val[0]) 387 | return; 388 | new = xmalloc(r->var_len + 1 + strlen(r->val) + 1); 389 | sprintf(new, "%s=%s", r->var, r->val); 390 | } 391 | else 392 | { 393 | pos = 0; 394 | while (environ[pos] && !match_env_var(environ[pos], r)) 395 | pos++; 396 | if (!(new = environ[pos])) 397 | return; 398 | } 399 | 400 | // Add it at the end of the array 401 | env[(*env_sizep)++] = new; 402 | env[*env_sizep] = NULL; 403 | } 404 | 405 | static char ** 406 | setup_environment(void) 407 | { 408 | // Link built-in rules with user rules 409 | for (int i=ARRAY_SIZE(default_env_rules)-1; i >= 0; i--) 410 | { 411 | default_env_rules[i].next = first_env_rule; 412 | first_env_rule = &default_env_rules[i]; 413 | } 414 | 415 | // Scan the original environment 416 | char **orig_env = environ; 417 | int orig_size = 0; 418 | while (orig_env[orig_size]) 419 | orig_size++; 420 | 421 | // For each rule, reserve one more slot and calculate length 422 | int num_rules = 0; 423 | for (struct env_rule *r = first_env_rule; r; r=r->next) 424 | { 425 | num_rules++; 426 | r->var_len = strlen(r->var); 427 | } 428 | 429 | // Create a new environment 430 | char **env = xmalloc((orig_size + num_rules + 1) * sizeof(char *)); 431 | int size; 432 | if (pass_environ) 433 | { 434 | memcpy(env, environ, orig_size * sizeof(char *)); 435 | size = orig_size; 436 | } 437 | else 438 | size = 0; 439 | env[size] = NULL; 440 | 441 | // Apply the rules one by one 442 | for (struct env_rule *r = first_env_rule; r; r=r->next) 443 | apply_env_rule(env, &size, r); 444 | 445 | // Return the new env and pass some gossip 446 | if (verbose > 1) 447 | { 448 | fprintf(stderr, "Passing environment:\n"); 449 | for (int i=0; env[i]; i++) 450 | fprintf(stderr, "\t%s\n", env[i]); 451 | } 452 | return env; 453 | } 454 | 455 | /*** Directory rules ***/ 456 | 457 | struct dir_rule { 458 | char *inside; // A relative path 459 | char *outside; // This can be an absolute path or a 
relative path starting with "./" 460 | unsigned int flags; // DIR_FLAG_xxx 461 | struct dir_rule *next; 462 | }; 463 | 464 | enum dir_rule_flags { 465 | DIR_FLAG_RW = 1, 466 | DIR_FLAG_NOEXEC = 2, 467 | DIR_FLAG_FS = 4, 468 | DIR_FLAG_MAYBE = 8, 469 | DIR_FLAG_DEV = 16, 470 | }; 471 | 472 | static const char * const dir_flag_names[] = { "rw", "noexec", "fs", "maybe", "dev" }; 473 | 474 | static struct dir_rule *first_dir_rule; 475 | static struct dir_rule **last_dir_rule = &first_dir_rule; 476 | 477 | static int add_dir_rule(char *in, char *out, unsigned int flags) 478 | { 479 | // Make sure that "in" is relative 480 | while (in[0] == '/') 481 | in++; 482 | if (!*in) 483 | return 0; 484 | 485 | // Check "out" 486 | if (flags & DIR_FLAG_FS) 487 | { 488 | if (!out || out[0] == '/') 489 | return 0; 490 | } 491 | else 492 | { 493 | if (out && out[0] != '/' && strncmp(out, "./", 2)) 494 | return 0; 495 | } 496 | 497 | // Override an existing rule 498 | struct dir_rule *r; 499 | for (r = first_dir_rule; r; r = r->next) 500 | if (!strcmp(r->inside, in)) 501 | break; 502 | 503 | // Add a new rule 504 | if (!r) 505 | { 506 | r = xmalloc(sizeof(*r)); 507 | r->inside = in; 508 | *last_dir_rule = r; 509 | last_dir_rule = &r->next; 510 | r->next = NULL; 511 | } 512 | r->outside = out; 513 | r->flags = flags; 514 | return 1; 515 | } 516 | 517 | static unsigned int parse_dir_option(char *opt) 518 | { 519 | for (unsigned int i = 0; i < ARRAY_SIZE(dir_flag_names); i++) 520 | if (!strcmp(opt, dir_flag_names[i])) 521 | return 1U << i; 522 | die("Unknown directory option %s", opt); 523 | } 524 | 525 | static int set_dir_action(char *arg) 526 | { 527 | arg = xstrdup(arg); 528 | 529 | char *colon = strchr(arg, ':'); 530 | unsigned int flags = 0; 531 | while (colon) 532 | { 533 | *colon++ = 0; 534 | char *next = strchr(colon, ':'); 535 | if (next) 536 | *next = 0; 537 | flags |= parse_dir_option(colon); 538 | colon = next; 539 | } 540 | 541 | char *eq = strchr(arg, '='); 542 | if (eq) 
543 | { 544 | *eq++ = 0; 545 | return add_dir_rule(arg, (*eq ? eq : NULL), flags); 546 | } 547 | else 548 | { 549 | char *out = xmalloc(1 + strlen(arg) + 1); 550 | sprintf(out, "/%s", arg); 551 | return add_dir_rule(arg, out, flags); 552 | } 553 | } 554 | 555 | static void init_dir_rules(void) 556 | { 557 | set_dir_action("box=./box:rw"); 558 | set_dir_action("bin"); 559 | set_dir_action("dev:dev"); 560 | set_dir_action("lib"); 561 | set_dir_action("lib64:maybe"); 562 | set_dir_action("proc=proc:fs"); 563 | set_dir_action("usr"); 564 | } 565 | 566 | /* Create the directory 'path' together with all missing parent directories. The string is modified in place while walking its components (each '/' is temporarily replaced by a NUL) and is restored before returning. */ static void make_dir(char *path) 567 | { 568 | char *sep = (path[0] == '/' ? path+1 : path); 569 | 570 | for (;;) 571 | { 572 | sep = strchr(sep, '/'); 573 | if (sep) 574 | *sep = 0; 575 | 576 | /* No trailing newline here: die() appends one itself and also copies the message into the meta-file. */ if (!dir_exists(path) && mkdir(path, 0777) < 0) 577 | die("Cannot create directory %s: %m", path); 578 | 579 | if (!sep) 580 | return; 581 | *sep++ = '/'; 582 | } 583 | } 584 | 585 | static void apply_dir_rules(void) 586 | { 587 | for (struct dir_rule *r = first_dir_rule; r; r=r->next) 588 | { 589 | char *in = r->inside; 590 | char *out = r->outside; 591 | if (!out) 592 | { 593 | msg("Not binding anything on %s\n", r->inside); 594 | continue; 595 | } 596 | 597 | if ((r->flags & DIR_FLAG_MAYBE) && !dir_exists(out)) 598 | { 599 | msg("Not binding %s on %s (does not exist)\n", out, r->inside); 600 | continue; 601 | } 602 | 603 | char root_in[1024]; 604 | /* Refuse over-long rules instead of silently creating and mounting on a truncated path. */ int root_len = snprintf(root_in, sizeof(root_in), "root/%s", in); if (root_len < 0 || root_len >= (int) sizeof(root_in)) die("Path of directory rule %s is too long", in); 605 | make_dir(root_in); 606 | 607 | unsigned long mount_flags = 0; 608 | if (!(r->flags & DIR_FLAG_RW)) 609 | mount_flags |= MS_RDONLY; 610 | if (r->flags & DIR_FLAG_NOEXEC) 611 | mount_flags |= MS_NOEXEC; 612 | if (!(r->flags & DIR_FLAG_DEV)) 613 | mount_flags |= MS_NODEV; 614 | 615 | if (r->flags & DIR_FLAG_FS) 616 | { 617 | msg("Mounting %s on %s (flags %lx)\n", out, in, mount_flags); 618 | if (mount("none", root_in, out, mount_flags, "") < 0) 619 | die("Cannot mount %s on %s: %m", out, in); 620 | } 621 | else 622 | { 623 | 
mount_flags |= MS_BIND | MS_NOSUID; 624 | msg("Binding %s on %s (flags %lx)\n", out, in, mount_flags); 625 | // Most mount flags need remount to work 626 | if (mount(out, root_in, "none", mount_flags, "") < 0 || 627 | mount(out, root_in, "none", MS_REMOUNT | mount_flags, "") < 0) 628 | die("Cannot mount %s on %s: %m", out, in); 629 | } 630 | } 631 | } 632 | 633 | /*** Control groups ***/ 634 | 635 | struct cg_controller_desc { 636 | const char *name; 637 | int optional; 638 | }; 639 | 640 | typedef enum { 641 | CG_MEMORY = 0, 642 | CG_CPUACCT, 643 | CG_CPUSET, 644 | CG_NUM_CONTROLLERS, 645 | } cg_controller; 646 | 647 | static const struct cg_controller_desc cg_controllers[CG_NUM_CONTROLLERS+1] = { 648 | [CG_MEMORY] = { "memory", 0 }, 649 | [CG_CPUACCT] = { "cpuacct", 0 }, 650 | [CG_CPUSET] = { "cpuset", 1 }, 651 | [CG_NUM_CONTROLLERS] = { NULL, 0 }, 652 | }; 653 | 654 | #define FOREACH_CG_CONTROLLER(_controller) \ 655 | for (cg_controller (_controller) = 0; \ 656 | (_controller) < CG_NUM_CONTROLLERS; (_controller)++) 657 | 658 | static const char *cg_controller_name(cg_controller c) 659 | { 660 | return cg_controllers[c].name; 661 | } 662 | 663 | static int cg_controller_optional(cg_controller c) 664 | { 665 | return cg_controllers[c].optional; 666 | } 667 | 668 | static char cg_name[256]; 669 | 670 | #define CG_BUFSIZE 1024 671 | 672 | static void 673 | cg_makepath(char *buf, size_t len, cg_controller c, const char *attr) 674 | { 675 | const char *cg_root = CONFIG_ISOLATE_CGROUP_ROOT; 676 | snprintf(buf, len, "%s/%s/%s/%s", cg_root, cg_controller_name(c), cg_name, attr); 677 | } 678 | 679 | static int 680 | cg_read(cg_controller controller, const char *attr, char *buf) 681 | { 682 | int result = 0; 683 | int maybe = 0; 684 | if (attr[0] == '?') 685 | { 686 | attr++; 687 | maybe = 1; 688 | } 689 | 690 | char path[256]; 691 | cg_makepath(path, sizeof(path), controller, attr); 692 | 693 | int fd = open(path, O_RDONLY); 694 | if (fd < 0) 695 | { 696 | if (maybe) 697 
| goto fail; 698 | die("Cannot read %s: %m", path); 699 | } 700 | 701 | int n = read(fd, buf, CG_BUFSIZE); 702 | if (n < 0) 703 | { 704 | if (maybe) 705 | goto fail_close; 706 | die("Cannot read %s: %m", path); 707 | } 708 | if (n >= CG_BUFSIZE - 1) 709 | die("Attribute %s too long", path); 710 | if (n > 0 && buf[n-1] == '\n') 711 | n--; 712 | buf[n] = 0; 713 | 714 | if (verbose > 1) 715 | msg("CG: Read %s = %s\n", attr, buf); 716 | 717 | result = 1; 718 | fail_close: 719 | close(fd); 720 | fail: 721 | return result; 722 | } 723 | 724 | static void __attribute__((format(printf,3,4))) 725 | cg_write(cg_controller controller, const char *attr, const char *fmt, ...) 726 | { 727 | int maybe = 0; 728 | if (attr[0] == '?') 729 | { 730 | attr++; 731 | maybe = 1; 732 | } 733 | 734 | va_list args; 735 | va_start(args, fmt); 736 | 737 | char buf[CG_BUFSIZE]; 738 | int n = vsnprintf(buf, sizeof(buf), fmt, args); 739 | if (n >= CG_BUFSIZE) 740 | die("cg_write: Value for attribute %s is too long", attr); 741 | 742 | if (verbose > 1) 743 | msg("CG: Write %s = %s", attr, buf); 744 | 745 | char path[256]; 746 | cg_makepath(path, sizeof(path), controller, attr); 747 | 748 | int fd = open(path, O_WRONLY | O_TRUNC); 749 | if (fd < 0) 750 | { 751 | if (maybe) 752 | goto fail; 753 | else 754 | die("Cannot write %s: %m", path); 755 | } 756 | 757 | int written = write(fd, buf, n); 758 | if (written < 0) 759 | { 760 | if (maybe) 761 | goto fail_close; 762 | else 763 | die("Cannot set %s to %s: %m", path, buf); 764 | } 765 | if (written != n) 766 | die("Short write to %s (%d out of %d bytes)", path, written, n); 767 | 768 | fail_close: 769 | close(fd); 770 | fail: 771 | va_end(args); 772 | } 773 | 774 | static void 775 | cg_init(void) 776 | { 777 | if (!cg_enable) 778 | return; 779 | 780 | char *cg_root = CONFIG_ISOLATE_CGROUP_ROOT; 781 | if (!dir_exists(cg_root)) 782 | die("Control group filesystem at %s not mounted", cg_root); 783 | 784 | snprintf(cg_name, sizeof(cg_name), "box-%d", 
box_id); 785 | msg("Using control group %s\n", cg_name); 786 | } 787 | 788 | static void 789 | cg_prepare(void) 790 | { 791 | if (!cg_enable) 792 | return; 793 | 794 | struct stat st; 795 | char buf[CG_BUFSIZE]; 796 | char path[256]; 797 | 798 | FOREACH_CG_CONTROLLER(controller) 799 | { 800 | cg_makepath(path, sizeof(path), controller, ""); 801 | if (stat(path, &st) >= 0 || errno != ENOENT) 802 | { 803 | msg("Control group %s already exists, trying to empty it.\n", path); 804 | if (rmdir(path) < 0) 805 | die("Failed to reset control group %s: %m", path); 806 | } 807 | 808 | if (mkdir(path, 0777) < 0 && !cg_controller_optional(controller)) 809 | die("Failed to create control group %s: %m", path); 810 | } 811 | 812 | // If cpuset module is enabled, copy allowed cpus and memory nodes from parent group 813 | if (cg_read(CG_CPUSET, "?cpuset.cpus", buf)) 814 | cg_write(CG_CPUSET, "cpuset.cpus", "%s", buf); 815 | if (cg_read(CG_CPUSET, "?cpuset.mems", buf)) 816 | cg_write(CG_CPUSET, "cpuset.mems", "%s", buf); 817 | } 818 | 819 | static void 820 | cg_enter(void) 821 | { 822 | if (!cg_enable) 823 | return; 824 | 825 | msg("Entering control group %s\n", cg_name); 826 | 827 | FOREACH_CG_CONTROLLER(controller) 828 | { 829 | if (cg_controller_optional(controller)) 830 | cg_write(controller, "?tasks", "%d\n", (int) getpid()); 831 | else 832 | cg_write(controller, "tasks", "%d\n", (int) getpid()); 833 | } 834 | 835 | if (cg_memory_limit) 836 | { 837 | cg_write(CG_MEMORY, "memory.limit_in_bytes", "%lld\n", (long long) cg_memory_limit << 10); 838 | cg_write(CG_MEMORY, "?memory.memsw.limit_in_bytes", "%lld\n", (long long) cg_memory_limit << 10); 839 | } 840 | 841 | if (cg_timing) 842 | cg_write(CG_CPUACCT, "cpuacct.usage", "0\n"); 843 | } 844 | 845 | static int 846 | cg_get_run_time_ms(void) 847 | { 848 | if (!cg_enable) 849 | return 0; 850 | 851 | char buf[CG_BUFSIZE]; 852 | cg_read(CG_CPUACCT, "cpuacct.usage", buf); 853 | unsigned long long ns = atoll(buf); 854 | return ns / 
1000000; 855 | } 856 | 857 | static void 858 | cg_stats(void) 859 | { 860 | if (!cg_enable) 861 | return; 862 | 863 | char buf[CG_BUFSIZE]; 864 | 865 | // Memory usage statistics 866 | unsigned long long mem=0, memsw=0; 867 | if (cg_read(CG_MEMORY, "?memory.max_usage_in_bytes", buf)) 868 | mem = atoll(buf); 869 | if (cg_read(CG_MEMORY, "?memory.memsw.max_usage_in_bytes", buf)) 870 | { 871 | memsw = atoll(buf); 872 | if (memsw > mem) 873 | mem = memsw; 874 | } 875 | if (mem) 876 | meta_printf("cg-mem:%lld\n", mem >> 10); 877 | } 878 | 879 | static void 880 | cg_remove(void) 881 | { 882 | char buf[CG_BUFSIZE]; 883 | 884 | if (!cg_enable) 885 | return; 886 | 887 | FOREACH_CG_CONTROLLER(controller) 888 | { 889 | if (cg_controller_optional(controller)) 890 | { 891 | if (!cg_read(controller, "?tasks", buf)) 892 | continue; 893 | } 894 | else 895 | cg_read(controller, "tasks", buf); 896 | 897 | if (buf[0]) 898 | die("Some tasks left in controller %s of cgroup %s, failed to remove it", 899 | cg_controller_name(controller), cg_name); 900 | 901 | char path[256]; 902 | cg_makepath(path, sizeof(path), controller, ""); 903 | 904 | if (rmdir(path) < 0) 905 | die("Cannot remove control group %s: %m", path); 906 | } 907 | } 908 | 909 | /*** Disk quotas ***/ 910 | 911 | static int 912 | path_begins_with(char *path, char *with) 913 | { 914 | while (*with) 915 | if (*path++ != *with++) 916 | return 0; 917 | return (!*with || *with == '/'); 918 | } 919 | 920 | static char * 921 | find_device(char *path) 922 | { 923 | FILE *f = setmntent("/proc/mounts", "r"); 924 | if (!f) 925 | die("Cannot open /proc/mounts: %m"); 926 | 927 | struct mntent *me; 928 | int best_len = 0; 929 | char *best_dev = NULL; 930 | while (me = getmntent(f)) 931 | { 932 | if (!path_begins_with(me->mnt_fsname, "/dev")) 933 | continue; 934 | if (path_begins_with(path, me->mnt_dir)) 935 | { 936 | int len = strlen(me->mnt_dir); 937 | if (len > best_len) 938 | { 939 | best_len = len; 940 | free(best_dev); 941 | best_dev 
= xstrdup(me->mnt_fsname); 942 | } 943 | } 944 | } 945 | endmntent(f); 946 | return best_dev; 947 | } 948 | 949 | static void 950 | set_quota(void) 951 | { 952 | if (!block_quota) 953 | return; 954 | 955 | char cwd[PATH_MAX]; 956 | if (!getcwd(cwd, sizeof(cwd))) 957 | die("getcwd: %m"); 958 | 959 | char *dev = find_device(cwd); 960 | if (!dev) 961 | die("Cannot identify filesystem which contains %s", cwd); 962 | msg("Quota: Mapped path %s to a filesystem on %s\n", cwd, dev); 963 | 964 | // Sanity check 965 | struct stat dev_st, cwd_st; 966 | if (stat(dev, &dev_st) < 0) 967 | die("Cannot identify block device %s: %m", dev); 968 | if (!S_ISBLK(dev_st.st_mode)) 969 | die("Expected that %s is a block device", dev); 970 | if (stat(".", &cwd_st) < 0) 971 | die("Cannot stat cwd: %m"); 972 | if (cwd_st.st_dev != dev_st.st_rdev) 973 | die("Identified %s as a filesystem on %s, but it is obviously false", cwd, dev); 974 | 975 | struct dqblk dq = { 976 | .dqb_bhardlimit = block_quota, 977 | .dqb_bsoftlimit = block_quota, 978 | .dqb_ihardlimit = inode_quota, 979 | .dqb_isoftlimit = inode_quota, 980 | .dqb_valid = QIF_LIMITS, 981 | }; 982 | if (quotactl(QCMD(Q_SETQUOTA, USRQUOTA), dev, box_uid, (caddr_t) &dq) < 0) 983 | die("Cannot set disk quota: %m"); 984 | msg("Quota: Set block quota %d and inode quota %d\n", block_quota, inode_quota); 985 | 986 | free(dev); 987 | } 988 | 989 | /*** The keeper process ***/ 990 | 991 | static void 992 | signal_alarm(int unused UNUSED) 993 | { 994 | /* Time limit checks are synchronous, so we only schedule them there. */ 995 | timer_tick = 1; 996 | alarm(1); 997 | } 998 | 999 | static void 1000 | signal_int(int signum) 1001 | { 1002 | /* Interrupts are fatal, so no synchronization requirements. 
*/ 1003 | meta_printf("exitsig:%d\n", signum); 1004 | err("SG: Interrupted"); 1005 | } 1006 | 1007 | #define PROC_BUF_SIZE 4096 1008 | static void 1009 | read_proc_file(char *buf, char *name, int *fdp) 1010 | { 1011 | int c; 1012 | 1013 | if (!*fdp) 1014 | { 1015 | sprintf(buf, "/proc/%d/%s", (int) box_pid, name); 1016 | *fdp = open(buf, O_RDONLY); 1017 | if (*fdp < 0) 1018 | die("open(%s): %m", buf); 1019 | } 1020 | lseek(*fdp, 0, SEEK_SET); 1021 | if ((c = read(*fdp, buf, PROC_BUF_SIZE-1)) < 0) 1022 | die("read on /proc/$pid/%s: %m", name); 1023 | if (c >= PROC_BUF_SIZE-1) 1024 | die("/proc/$pid/%s too long", name); 1025 | buf[c] = 0; 1026 | } 1027 | 1028 | static int 1029 | get_wall_time_ms(void) 1030 | { 1031 | struct timeval now, wall; 1032 | gettimeofday(&now, NULL); 1033 | timersub(&now, &start_time, &wall); 1034 | return wall.tv_sec*1000 + wall.tv_usec/1000; 1035 | } 1036 | 1037 | static int 1038 | get_run_time_ms(struct rusage *rus) 1039 | { 1040 | if (cg_timing) 1041 | return cg_get_run_time_ms(); 1042 | 1043 | if (rus) 1044 | { 1045 | struct timeval total; 1046 | timeradd(&rus->ru_utime, &rus->ru_stime, &total); 1047 | return total.tv_sec*1000 + total.tv_usec/1000; 1048 | } 1049 | 1050 | char buf[PROC_BUF_SIZE], *x; 1051 | int utime, stime; 1052 | static int proc_stat_fd; 1053 | 1054 | read_proc_file(buf, "stat", &proc_stat_fd); 1055 | x = buf; 1056 | while (*x && *x != ' ') 1057 | x++; 1058 | while (*x == ' ') 1059 | x++; 1060 | if (*x++ != '(') 1061 | die("proc stat syntax error 1"); 1062 | while (*x && (*x != ')' || x[1] != ' ')) 1063 | x++; 1064 | while (*x == ')' || *x == ' ') 1065 | x++; 1066 | if (sscanf(x, "%*c %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d %d %d", &utime, &stime) != 2) 1067 | die("proc stat syntax error 2"); 1068 | 1069 | return (utime + stime) * 1000 / ticks_per_sec; 1070 | } 1071 | 1072 | static void 1073 | check_timeout(void) 1074 | { 1075 | if (wall_timeout) 1076 | { 1077 | int wall_ms = get_wall_time_ms(); 1078 | if (wall_ms > 
wall_timeout) 1079 | err("TO: Time limit exceeded (wall clock)"); 1080 | if (verbose > 1) 1081 | fprintf(stderr, "[wall time check: %d msec]\n", wall_ms); 1082 | } 1083 | if (timeout) 1084 | { 1085 | int ms = get_run_time_ms(NULL); 1086 | if (verbose > 1) 1087 | fprintf(stderr, "[time check: %d msec]\n", ms); 1088 | if (ms > timeout && ms > extra_timeout) 1089 | err("TO: Time limit exceeded"); 1090 | } 1091 | } 1092 | 1093 | static void 1094 | box_keeper(void) 1095 | { 1096 | read_errors_from_fd = error_pipes[0]; 1097 | close(error_pipes[1]); 1098 | 1099 | struct sigaction sa; 1100 | bzero(&sa, sizeof(sa)); 1101 | sa.sa_handler = signal_int; 1102 | sigaction(SIGHUP, &sa, NULL); 1103 | sigaction(SIGINT, &sa, NULL); 1104 | sigaction(SIGQUIT, &sa, NULL); 1105 | sigaction(SIGILL, &sa, NULL); 1106 | sigaction(SIGABRT, &sa, NULL); 1107 | sigaction(SIGFPE, &sa, NULL); 1108 | sigaction(SIGSEGV, &sa, NULL); 1109 | sigaction(SIGPIPE, &sa, NULL); 1110 | sigaction(SIGTERM, &sa, NULL); 1111 | sigaction(SIGUSR1, &sa, NULL); 1112 | sigaction(SIGUSR2, &sa, NULL); 1113 | 1114 | gettimeofday(&start_time, NULL); 1115 | ticks_per_sec = sysconf(_SC_CLK_TCK); 1116 | if (ticks_per_sec <= 0) 1117 | die("Invalid ticks_per_sec!"); 1118 | 1119 | if (timeout || wall_timeout) 1120 | { 1121 | sa.sa_handler = signal_alarm; 1122 | sigaction(SIGALRM, &sa, NULL); 1123 | alarm(1); 1124 | } 1125 | 1126 | for(;;) 1127 | { 1128 | struct rusage rus; 1129 | int stat; 1130 | pid_t p; 1131 | if (timer_tick) 1132 | { 1133 | check_timeout(); 1134 | timer_tick = 0; 1135 | } 1136 | p = wait4(box_pid, &stat, 0, &rus); 1137 | if (p < 0) 1138 | { 1139 | if (errno == EINTR) 1140 | continue; 1141 | die("wait4: %m"); 1142 | } 1143 | if (p != box_pid) 1144 | die("wait4: unknown pid %d exited!", p); 1145 | box_pid = 0; 1146 | 1147 | // Check error pipe if there is an internal error passed from inside the box 1148 | char interr[1024]; 1149 | int n = read(read_errors_from_fd, interr, sizeof(interr) - 1); 1150 | if (n > 
0) 1151 | { 1152 | interr[n] = 0; 1153 | die("%s", interr); 1154 | } 1155 | 1156 | if (WIFEXITED(stat)) 1157 | { 1158 | final_stats(&rus); 1159 | if (WEXITSTATUS(stat)) 1160 | { 1161 | meta_printf("exitcode:%d\n", WEXITSTATUS(stat)); 1162 | err("RE: Exited with error status %d", WEXITSTATUS(stat)); 1163 | } 1164 | if (timeout && total_ms > timeout) 1165 | err("TO: Time limit exceeded"); 1166 | if (wall_timeout && wall_ms > wall_timeout) 1167 | err("TO: Time limit exceeded (wall clock)"); 1168 | flush_line(); 1169 | fprintf(stderr, "OK (%d.%03d sec real, %d.%03d sec wall)\n", 1170 | total_ms/1000, total_ms%1000, 1171 | wall_ms/1000, wall_ms%1000); 1172 | box_exit(0); 1173 | } 1174 | else if (WIFSIGNALED(stat)) 1175 | { 1176 | meta_printf("exitsig:%d\n", WTERMSIG(stat)); 1177 | final_stats(&rus); 1178 | err("SG: Caught fatal signal %d", WTERMSIG(stat)); 1179 | } 1180 | else if (WIFSTOPPED(stat)) 1181 | { 1182 | meta_printf("exitsig:%d\n", WSTOPSIG(stat)); 1183 | final_stats(&rus); 1184 | err("SG: Stopped by signal %d", WSTOPSIG(stat)); 1185 | } 1186 | else 1187 | die("wait4: unknown status %x, giving up!", stat); 1188 | } 1189 | } 1190 | 1191 | /*** The process running inside the box ***/ 1192 | 1193 | static void 1194 | setup_root(void) 1195 | { 1196 | if (mkdir("root", 0750) < 0 && errno != EEXIST) 1197 | die("mkdir('root'): %m"); 1198 | 1199 | /* 1200 | * Ensure all mounts are private, not shared. We don't want our mounts 1201 | * appearing outside of our namespace. 1202 | * (systemd since version 188 mounts filesystems shared by default). 
1203 | */ 1204 | if (mount(NULL, "/", NULL, MS_REC|MS_PRIVATE, NULL) < 0) 1205 | die("Cannot privatize mounts: %m"); 1206 | 1207 | if (mount("none", "root", "tmpfs", 0, "mode=755") < 0) 1208 | die("Cannot mount root ramdisk: %m"); 1209 | 1210 | apply_dir_rules(); 1211 | 1212 | if (chroot("root") < 0) 1213 | die("Chroot failed: %m"); 1214 | 1215 | if (chdir("root/box") < 0) 1216 | die("Cannot change current directory: %m"); 1217 | } 1218 | 1219 | static void 1220 | setup_credentials(void) 1221 | { 1222 | if (setresgid(box_gid, box_gid, box_gid) < 0) 1223 | die("setresgid: %m"); 1224 | if (setgroups(0, NULL) < 0) 1225 | die("setgroups: %m"); 1226 | if (setresuid(box_uid, box_uid, box_uid) < 0) 1227 | die("setresuid: %m"); 1228 | setpgrp(); 1229 | } 1230 | 1231 | static void 1232 | setup_fds(void) 1233 | { 1234 | if (redir_stdin) 1235 | { 1236 | close(0); 1237 | if (open(redir_stdin, O_RDONLY) != 0) 1238 | die("open(\"%s\"): %m", redir_stdin); 1239 | } 1240 | if (redir_stdout) 1241 | { 1242 | close(1); 1243 | if (open(redir_stdout, O_WRONLY | O_CREAT | O_TRUNC, 0666) != 1) 1244 | die("open(\"%s\"): %m", redir_stdout); 1245 | } 1246 | if (redir_stderr) 1247 | { 1248 | close(2); 1249 | if (open(redir_stderr, O_WRONLY | O_CREAT | O_TRUNC, 0666) != 2) 1250 | die("open(\"%s\"): %m", redir_stderr); 1251 | } 1252 | else 1253 | dup2(1, 2); 1254 | } 1255 | 1256 | static void 1257 | setup_rlim(const char *res_name, int res, rlim_t limit) 1258 | { 1259 | struct rlimit rl = { .rlim_cur = limit, .rlim_max = limit }; 1260 | if (setrlimit(res, &rl) < 0) 1261 | die("setrlimit(%s, %jd)", res_name, (intmax_t) limit); 1262 | } 1263 | 1264 | static void 1265 | setup_rlimits(void) 1266 | { 1267 | #define RLIM(res, val) setup_rlim("RLIMIT_" #res, RLIMIT_##res, val) 1268 | 1269 | if (memory_limit) 1270 | RLIM(AS, (rlim_t)memory_limit * 1024); 1271 | 1272 | if (fsize_limit) 1273 | RLIM(FSIZE, (rlim_t)fsize_limit * 1024); 1274 | 1275 | RLIM(STACK, (stack_limit ? 
(rlim_t)stack_limit * 1024 : RLIM_INFINITY)); 1276 | RLIM(NOFILE, 64); 1277 | RLIM(MEMLOCK, 0); 1278 | 1279 | if (max_processes) 1280 | RLIM(NPROC, max_processes); 1281 | 1282 | #undef RLIM 1283 | } 1284 | 1285 | static int 1286 | box_inside(void *arg) 1287 | { 1288 | char **args = arg; 1289 | write_errors_to_fd = error_pipes[1]; 1290 | close(error_pipes[0]); 1291 | meta_close(); 1292 | 1293 | cg_enter(); 1294 | setup_root(); 1295 | setup_credentials(); 1296 | setup_fds(); 1297 | setup_rlimits(); 1298 | char **env = setup_environment(); 1299 | 1300 | if (set_cwd && chdir(set_cwd)) 1301 | die("chdir: %m"); 1302 | 1303 | execve(args[0], args, env); 1304 | die("execve(\"%s\"): %m", args[0]); 1305 | } 1306 | 1307 | static void 1308 | box_init(void) 1309 | { 1310 | if (box_id < 0 || box_id >= CONFIG_ISOLATE_NUM_BOXES) 1311 | die("Sandbox ID out of range (allowed: 0-%d)", CONFIG_ISOLATE_NUM_BOXES-1); 1312 | box_uid = CONFIG_ISOLATE_FIRST_UID + box_id; 1313 | box_gid = CONFIG_ISOLATE_FIRST_GID + box_id; 1314 | 1315 | snprintf(box_dir, sizeof(box_dir), "%s/%d", CONFIG_ISOLATE_BOX_DIR, box_id); 1316 | make_dir(box_dir); 1317 | if (chdir(box_dir) < 0) 1318 | die("chdir(%s): %m", box_dir); 1319 | } 1320 | 1321 | /*** Commands ***/ 1322 | 1323 | static void 1324 | init(void) 1325 | { 1326 | msg("Preparing sandbox directory\n"); 1327 | rmtree("box"); 1328 | if (mkdir("box", 0700) < 0) 1329 | die("Cannot create box: %m"); 1330 | if (chown("box", orig_uid, orig_gid) < 0) 1331 | die("Cannot chown box: %m"); 1332 | 1333 | cg_prepare(); 1334 | set_quota(); 1335 | } 1336 | 1337 | static void 1338 | cleanup(void) 1339 | { 1340 | if (!dir_exists("box")) 1341 | die("Box directory not found, there isn't anything to clean up"); 1342 | 1343 | msg("Deleting sandbox directory\n"); 1344 | rmtree(box_dir); 1345 | cg_remove(); 1346 | } 1347 | 1348 | static void 1349 | run(char **argv) 1350 | { 1351 | if (!dir_exists("box")) 1352 | die("Box directory not found, did you run `isolate --init'?"); 
1353 | 1354 | chowntree("box", box_uid, box_gid); 1355 | cleanup_ownership = 1; 1356 | 1357 | if (pipe(error_pipes) < 0) 1358 | die("pipe: %m"); 1359 | for (int i=0; i<2; i++) 1360 | if (fcntl(error_pipes[i], F_SETFD, fcntl(error_pipes[i], F_GETFD) | FD_CLOEXEC) < 0 || 1361 | fcntl(error_pipes[i], F_SETFL, fcntl(error_pipes[i], F_GETFL) | O_NONBLOCK) < 0) 1362 | die("fcntl on pipe: %m"); 1363 | 1364 | box_pid = clone( 1365 | box_inside, // Function to execute as the body of the new process 1366 | argv, // Pass our stack 1367 | SIGCHLD | CLONE_NEWIPC | (share_net ? 0 : CLONE_NEWNET) | CLONE_NEWNS | CLONE_NEWPID, 1368 | argv); // Pass the arguments 1369 | if (box_pid < 0) 1370 | die("clone: %m"); 1371 | if (!box_pid) 1372 | die("clone returned 0"); 1373 | box_keeper(); 1374 | } 1375 | 1376 | static void 1377 | show_version(void) 1378 | { 1379 | printf("The process isolator " VERSION "\n"); 1380 | printf("(c) 2012--" YEAR " Martin Mares and Bernard Blackham\n"); 1381 | printf("Built on " BUILD_DATE " from Git commit " BUILD_COMMIT "\n"); 1382 | printf("\nCompile-time configuration:\n"); 1383 | printf("Sandbox directory: %s\n", CONFIG_ISOLATE_BOX_DIR); 1384 | printf("Sandbox credentials: uid=%u-%u gid=%u-%u\n", 1385 | CONFIG_ISOLATE_FIRST_UID, 1386 | CONFIG_ISOLATE_FIRST_UID + CONFIG_ISOLATE_NUM_BOXES - 1, 1387 | CONFIG_ISOLATE_FIRST_GID, 1388 | CONFIG_ISOLATE_FIRST_GID + CONFIG_ISOLATE_NUM_BOXES - 1); 1389 | } 1390 | 1391 | /*** Options ***/ 1392 | 1393 | static void __attribute__((format(printf,1,2))) 1394 | usage(const char *msg, ...) 
1395 | { 1396 | if (msg != NULL) 1397 | { 1398 | va_list args; 1399 | va_start(args, msg); 1400 | vfprintf(stderr, msg, args); 1401 | va_end(args); 1402 | } 1403 | printf("\ 1404 | Usage: isolate [] \n\ 1405 | \n\ 1406 | Options:\n\ 1407 | -b, --box-id=\tWhen multiple sandboxes are used in parallel, each must get a unique ID\n\ 1408 | --cg\t\tEnable use of control groups\n\ 1409 | --cg-mem=\tLimit memory usage of the control group to KB\n\ 1410 | --cg-timing\t\tTime limits affects total run time of the control group\n\ 1411 | -c, --chdir=\tChange directory to before executing the program\n\ 1412 | -d, --dir=\t\tMake a directory visible inside the sandbox\n\ 1413 | --dir==\tMake a directory outside visible as inside\n\ 1414 | --dir==\t\tDelete a previously defined directory rule (even a default one)\n\ 1415 | --dir=...:\tSpecify options for a rule:\n\ 1416 | \t\t\t\tdev\tAllow access to special files\n\ 1417 | \t\t\t\tfs\tMount a filesystem (e.g., --dir=/proc:proc:fs)\n\ 1418 | \t\t\t\tmaybe\tSkip the rule if does not exist\n\ 1419 | \t\t\t\tnoexec\tDo not allow execution of binaries\n\ 1420 | \t\t\t\trw\tAllow read-write access\n\ 1421 | -f, --fsize=\tMax size (in KB) of files that can be created\n\ 1422 | -E, --env=\t\tInherit the environment variable from the parent process\n\ 1423 | -E, --env==\tSet the environment variable to ; unset it if is empty\n\ 1424 | -x, --extra-time=