├── .gitignore ├── LICENSE ├── README.md ├── cmd └── runprog │ ├── array_flags.go │ ├── config │ ├── config.go │ ├── config_amd64.go │ ├── config_arm.go │ ├── config_arm64.go │ ├── config_loader.go │ └── config_type.go │ ├── fileutil.go │ ├── main.go │ ├── main_darwin.go │ └── main_linux.go ├── container ├── benchmark_linux_test.go ├── consts_linux.go ├── container_cmd_linux.go ├── container_exec_linux.go ├── container_init_linux.go ├── doc.go ├── environment_linux.go ├── host_cmd_linux.go ├── host_exec_linux.go ├── lookup_linux.go ├── protocol_linux.go ├── signal_linux.go ├── signal_linux_mips64x.go ├── socket_linux.go └── utils.go ├── go.mod ├── go.sum ├── pkg ├── cgroup │ ├── benchmark_linux_test.go │ ├── cgroup_info_linux.go │ ├── cgroup_linux.go │ ├── consts_linux.go │ ├── doc.go │ ├── utils_linux.go │ ├── v1_linux.go │ ├── v1controller_linux.go │ └── v2_linux.go ├── forkexec │ ├── bench_linux_test.go │ ├── clone3_linux.go │ ├── consts_linux.go │ ├── doc.go │ ├── errloc_linux.go │ ├── fork_child_darwin.go │ ├── fork_child_linux.go │ ├── fork_darwin.go │ ├── fork_linux.go │ ├── fork_linux_test.go │ ├── fork_unix.go │ ├── fork_util.go │ ├── runner_darwin.go │ ├── runner_linux.go │ ├── sandbox_darwin_test.go │ ├── sandbox_load_darwin.go │ ├── syscall_darwin.go │ ├── test.sb │ ├── userns_linux.go │ ├── vfork │ │ ├── asm_linux_386.s │ │ ├── asm_linux_amd64.s │ │ ├── asm_linux_arm.s │ │ ├── asm_linux_arm64.s │ │ ├── asm_linux_loong64.s │ │ ├── asm_linux_mips64x.s │ │ ├── asm_linux_mipsx.s │ │ ├── asm_linux_ppc64x.s │ │ ├── asm_linux_riscv64.s │ │ ├── asm_linux_s390x.s │ │ └── syscall.go │ ├── zsyscall_darwin.go │ └── zsyscall_darwin.s ├── memfd │ ├── doc.go │ ├── memfd_linux.go │ ├── memfd_linux_test.go │ └── memfd_other.go ├── mount │ ├── builder.go │ ├── builder_linux.go │ ├── builder_linux_test.go │ ├── doc.go │ ├── mount.go │ ├── mount_linux.go │ └── mount_linux_test.go ├── pipe │ ├── buffer.go │ └── buffer_test.go ├── rlimit │ ├── rlimit.go │ └── rlimit_test.go 
├── seccomp │ ├── filter_linux.go │ └── libseccomp │ │ ├── action.go │ │ ├── action_linux.go │ │ ├── builder_linux.go │ │ ├── doc.go │ │ ├── seccomp_linux_test.go │ │ └── syscall_name_linux.go └── unixsocket │ ├── benchmark_linux_test.go │ ├── socket_linux.go │ └── socket_linux_test.go ├── ptracer ├── context_helper_linux.go ├── context_helper_linux_test.go ├── context_linux.go ├── context_linux_amd64.go ├── context_linux_arm.go ├── context_linux_arm64.go ├── context_other.go ├── doc.go ├── ptrace_linux.go ├── tracer.go └── tracer_track_linux.go └── runner ├── doc.go ├── limit.go ├── ptrace ├── filehandler │ ├── fileset.go │ ├── fileset_test.go │ ├── handle.go │ └── syscallcounter.go ├── handle_linux.go ├── run_linux.go └── runner_linux.go ├── result.go ├── runner.go ├── size.go ├── status.go └── unshare ├── doc.go ├── run_linux.go └── runner_linux.go /.gitignore: -------------------------------------------------------------------------------- 1 | # OS 2 | .DS_Store 3 | 4 | # Test Env 5 | test*/ 6 | env*.sh 7 | 8 | # Test Files 9 | /runprog 10 | /test 11 | 12 | .vscode 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 criyle 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
// arrayFlags accumulates string values, one per call to Set. Its String and
// Set methods give it the method set of the standard flag.Value interface.
type arrayFlags []string

// String renders the collected values using fmt's default slice notation,
// for example "[a b]".
func (a *arrayFlags) String() string {
	values := []string(*a)
	return fmt.Sprintf("%v", values)
}

// Set appends value to the collected list. It always returns nil.
func (a *arrayFlags) Set(value string) error {
	updated := append(*a, value)
	*a = updated
	return nil
}
"fadvise64", 40 | "pread64", 41 | "pwrite64", 42 | 43 | // memory action 44 | "mmap", 45 | "mprotect", 46 | "munmap", 47 | "brk", 48 | "mremap", 49 | "msync", 50 | "mincore", 51 | "madvise", 52 | 53 | // signal action 54 | "rt_sigaction", 55 | "rt_sigprocmask", 56 | "rt_sigreturn", 57 | "rt_sigpending", 58 | "sigaltstack", 59 | 60 | // get current work dir 61 | "getcwd", 62 | 63 | // process exit 64 | "exit", 65 | "exit_group", 66 | 67 | // others 68 | "arch_prctl", 69 | 70 | "gettimeofday", 71 | "getrlimit", 72 | "getrusage", 73 | "times", 74 | "time", 75 | "clock_gettime", 76 | 77 | "restart_syscall", 78 | } 79 | 80 | // default syscalls to trace 81 | defaultSyscallTraces = []string{ 82 | // execute file 83 | "execve", 84 | "execveat", 85 | 86 | // file open 87 | "open", 88 | "openat", 89 | 90 | // file delete 91 | "unlink", 92 | "unlinkat", 93 | 94 | // soft link 95 | "readlink", 96 | "readlinkat", 97 | 98 | // permission check 99 | "lstat", 100 | "stat", 101 | "access", 102 | "faccessat", 103 | } 104 | 105 | // process related syscall if allowProc enabled 106 | defaultProcSyscalls = []string{"clone", "fork", "vfork", "nanosleep", "execve"} 107 | 108 | // config for different type of program 109 | // workpath and arg0 have additional read / stat permission 110 | runptraceConfig = map[string]ProgramConfig{ 111 | "python2.7": { 112 | Syscall: SyscallConfig{ 113 | ExtraAllow: []string{ 114 | "futex", "getdents", "getdents64", "prlimit64", "getpid", "sysinfo", 115 | }, 116 | ExtraCount: map[string]int{ 117 | "set_tid_address": 1, 118 | "set_robust_list": 1, 119 | }, 120 | }, 121 | FileAccess: FileAccessConfig{ 122 | ExtraRead: []string{ 123 | "/usr/bin/python2.7", 124 | "/usr/lib/python2.7/", 125 | "/usr/bin/lib/python2.7/", 126 | "/usr/local/lib/python2.7/", 127 | "/usr/lib/pymodules/python2.7/", 128 | "/usr/bin/Modules/", 129 | "/usr/bin/pybuilddir.txt", 130 | "/usr/lib/locale/", 131 | "./answer.code", 132 | }, 133 | ExtraStat: []string{ 134 | "/usr", "/usr/bin", 
135 | }, 136 | }, 137 | RunCommand: []string{"/usr/bin/python2.7", "-E", "-s", "-B"}, 138 | }, 139 | "python3": { 140 | Syscall: SyscallConfig{ 141 | ExtraAllow: []string{ 142 | "futex", "getdents", "getdents64", "prlimit64", "getpid", "sysinfo", "getrandom", 143 | }, 144 | ExtraCount: map[string]int{ 145 | "set_tid_address": 1, 146 | "set_robust_list": 1, 147 | }, 148 | }, 149 | FileAccess: FileAccessConfig{ 150 | ExtraRead: []string{ 151 | "/usr/bin/python3", 152 | "/usr/lib/python3/", 153 | "/usr/bin/python3.6", 154 | "/usr/lib/python3.6/", 155 | "/usr/bin/lib/python3.6/", 156 | "/usr/local/lib/python3.6/", 157 | "/usr/bin/pyvenv.cfg", 158 | "/usr/pyvenv.cfg", 159 | "/usr/bin/Modules", 160 | "/usr/bin/pybuilddir.txt", 161 | "/usr/lib/dist-python", 162 | "/usr/lib/locale/", 163 | "./answer.code", 164 | }, 165 | ExtraStat: []string{ 166 | "/usr", "/usr/bin", "/usr/lib", "/usr/lib/python36.zip", 167 | }, 168 | }, 169 | RunCommand: []string{"/usr/bin/python3", "-I", "-B"}, 170 | }, 171 | "compiler": { 172 | Syscall: SyscallConfig{ 173 | ExtraAllow: []string{ 174 | "gettid", "set_tid_address", "set_robust_list", "futex", 175 | "getpid", "vfork", "fork", "clone", "execve", "wait4", 176 | "clock_gettime", "clock_getres", 177 | "setrlimit", "pipe", 178 | "getdents64", "getdents", 179 | "umask", "rename", "chmod", "mkdir", 180 | "chdir", "fchdir", 181 | "ftruncate", 182 | "sched_getaffinity", "sched_yield", 183 | "uname", "sysinfo", 184 | "prlimit64", "getrandom", 185 | "fchmodat", 186 | }, 187 | ExtraBan: []string{"socket", "connect", "geteuid", "getuid"}, 188 | }, 189 | FileAccess: FileAccessConfig{ 190 | ExtraWrite: []string{ 191 | "/tmp/", "./", 192 | }, 193 | ExtraRead: []string{ 194 | "./", 195 | "../runtime/", 196 | "/etc/oracle/java/usagetracker.properties", 197 | "/usr/", 198 | "/lib/", 199 | "/lib64/", 200 | "/bin/", 201 | "/sbin/", 202 | "/sys/devices/system/cpu/", 203 | "/proc/", 204 | "/etc/timezone", 205 | "/etc/fpc-2.6.2.cfg.d/", 206 | "/etc/fpc.cfg", 207 
| "/*", 208 | "/", // system_root 209 | }, 210 | ExtraBan: []string{ 211 | "/etc/nsswitch.conf", 212 | "/etc/passwd", 213 | }, 214 | }, 215 | }, 216 | } 217 | ) 218 | -------------------------------------------------------------------------------- /cmd/runprog/config/config_amd64.go: -------------------------------------------------------------------------------- 1 | package config 2 | 3 | // This file includes configs for the run program settings 4 | 5 | var ( 6 | archReadableFiles = []string{ 7 | "/lib/x86_64-linux-gnu/", 8 | "/usr/lib/x86_64-linux-gnu/", 9 | } 10 | 11 | archSyscallAllows = []string{} 12 | 13 | archSyscallTraces = []string{} 14 | ) 15 | -------------------------------------------------------------------------------- /cmd/runprog/config/config_arm.go: -------------------------------------------------------------------------------- 1 | package config 2 | 3 | // This file includes configs for the run program settings 4 | 5 | var ( 6 | archReadableFiles = []string{ 7 | "/lib/arm-linux-gnueabihf/", 8 | "/usr/lib/arm-linux-gnueabihf/", 9 | } 10 | 11 | archSyscallAllows = []string{ 12 | "fstat64", // 32-bit 13 | "_llseek", // 32-bit 14 | "fcntl64", // 32-bit 15 | "mmap2", // 32-bit 16 | // arch 17 | "uname", 18 | "set_tls", 19 | "arm_fadvise64_64", 20 | } 21 | 22 | archSyscallTraces = []string{ 23 | "lstat64", // 32-bit 24 | "stat64", // 32-bit 25 | } 26 | ) 27 | -------------------------------------------------------------------------------- /cmd/runprog/config/config_arm64.go: -------------------------------------------------------------------------------- 1 | package config 2 | 3 | // This file includes configs for the run program settings 4 | 5 | var ( 6 | archReadableFiles = []string{ 7 | "/lib/aarch64-linux-gnu/", 8 | "/usr/lib/aarch64-linux-gnu/", 9 | } 10 | 11 | archSyscallAllows = []string{ 12 | "newfstatat", 13 | } 14 | 15 | archSyscallTraces = []string{} 16 | ) 17 | 
-------------------------------------------------------------------------------- /cmd/runprog/config/config_loader.go: -------------------------------------------------------------------------------- 1 | package config 2 | 3 | import "github.com/criyle/go-sandbox/runner/ptrace/filehandler" 4 | 5 | // GetConf return file access check set, syscall counter, allow and traced syscall arrays and new args 6 | func GetConf(pType, workPath string, args, addRead, addWrite []string, 7 | allowProc bool) ([]string, []string, []string, *filehandler.Handler) { 8 | var ( 9 | fs = filehandler.NewFileSets() 10 | sc = filehandler.NewSyscallCounter() 11 | allow = append(append([]string{}, defaultSyscallAllows...), archSyscallAllows...) 12 | trace = append(append([]string{}, defaultSyscallTraces...), archSyscallTraces...) 13 | ) 14 | 15 | fs.Readable.AddRange(defaultReadableFiles, workPath) 16 | fs.Readable.AddRange(archReadableFiles, workPath) 17 | fs.Writable.AddRange(defaultWritableFiles, workPath) 18 | fs.AddFilePermission(args[0], filehandler.FilePermRead) 19 | fs.AddFilePermission(workPath, filehandler.FilePermRead) 20 | 21 | fs.Readable.AddRange(addRead, workPath) 22 | fs.Writable.AddRange(addWrite, workPath) 23 | 24 | if c, o := runptraceConfig[pType]; o { 25 | allow = append(allow, c.Syscall.ExtraAllow...) 26 | trace = append(trace, c.Syscall.ExtraBan...) 27 | sc.AddRange(c.Syscall.ExtraCount) 28 | fs.Readable.AddRange(c.FileAccess.ExtraRead, workPath) 29 | fs.Writable.AddRange(c.FileAccess.ExtraWrite, workPath) 30 | fs.Statable.AddRange(c.FileAccess.ExtraStat, workPath) 31 | fs.SoftBan.AddRange(c.FileAccess.ExtraBan, workPath) 32 | args = append(c.RunCommand, args...) 33 | } 34 | if allowProc { 35 | allow = append(allow, defaultProcSyscalls...) 
// cleanTrace removes duplicate names from both lists and drops from allow
// every name that also appears in trace, so a syscall listed in both ends up
// traced only.
//
// Each returned slice keeps the first-occurrence order of its input, which
// makes the result deterministic for a given pair of inputs. Both returned
// slices are non-nil, even when empty.
func cleanTrace(allow, trace []string) ([]string, []string) {
	seen := make(map[string]bool, len(allow)+len(trace))

	// Record traced names first so they take precedence over allow entries.
	uniqTrace := make([]string, 0, len(trace))
	for _, s := range trace {
		if seen[s] {
			continue
		}
		seen[s] = true
		uniqTrace = append(uniqTrace, s)
	}

	// Anything already seen is either traced or an earlier duplicate.
	uniqAllow := make([]string, 0, len(allow))
	for _, s := range allow {
		if seen[s] {
			continue
		}
		seen[s] = true
		uniqAllow = append(uniqAllow, s)
	}
	return uniqAllow, uniqTrace
}
// closeFiles closes every non-nil file in files. Nil entries are skipped and
// any error returned by Close is ignored.
func closeFiles(files []*os.File) {
	for i := range files {
		file := files[i]
		if file == nil {
			continue
		}
		file.Close()
	}
}
runner.StatusDisallowedSyscall: 49 | return int(StatusBan) 50 | case runner.StatusSignalled, runner.StatusNonzeroExitStatus: 51 | return int(StatusRE) 52 | default: 53 | return int(StatusFatal) 54 | } 55 | } 56 | 57 | func debug(v ...interface{}) { 58 | if showDetails { 59 | fmt.Fprintln(os.Stderr, v...) 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /cmd/runprog/main_darwin.go: -------------------------------------------------------------------------------- 1 | // Command runprog executes program defined restricted environment including seccomp-ptraced, namespaced and containerized. 2 | package main 3 | 4 | import ( 5 | "flag" 6 | "fmt" 7 | "os" 8 | "syscall" 9 | "time" 10 | 11 | "github.com/criyle/go-sandbox/pkg/forkexec" 12 | "github.com/criyle/go-sandbox/pkg/rlimit" 13 | "github.com/criyle/go-sandbox/runner" 14 | "golang.org/x/sys/unix" 15 | ) 16 | 17 | var ( 18 | timeLimit, realTimeLimit, memoryLimit, outputLimit, stackLimit uint64 19 | inputFileName, outputFileName, errorFileName, workPath string 20 | 21 | profilePath, result string 22 | showDetails bool 23 | 24 | args []string 25 | ) 26 | 27 | func main() { 28 | flag.Usage = printUsage 29 | flag.Uint64Var(&timeLimit, "tl", 1, "Set time limit (in second)") 30 | flag.Uint64Var(&realTimeLimit, "rtl", 0, "Set real time limit (in second)") 31 | flag.Uint64Var(&memoryLimit, "ml", 256, "Set memory limit (in mb)") 32 | flag.Uint64Var(&outputLimit, "ol", 64, "Set output limit (in mb)") 33 | flag.Uint64Var(&stackLimit, "sl", 32, "Set stack limit (in mb)") 34 | flag.StringVar(&inputFileName, "in", "", "Set input file name") 35 | flag.StringVar(&outputFileName, "out", "", "Set output file name") 36 | flag.StringVar(&errorFileName, "err", "", "Set error file name") 37 | flag.StringVar(&workPath, "work-path", "", "Set the work path of the program") 38 | flag.StringVar(&profilePath, "p", "", "sandbox profile") 39 | flag.BoolVar(&showDetails, "show-trace-details", false, 
"Show trace details") 40 | flag.StringVar(&result, "res", "stdout", "Set the file name for output the result") 41 | flag.Parse() 42 | 43 | args = flag.Args() 44 | if len(args) == 0 { 45 | printUsage() 46 | } 47 | 48 | if realTimeLimit < timeLimit { 49 | realTimeLimit = timeLimit + 2 50 | } 51 | if stackLimit > memoryLimit { 52 | stackLimit = memoryLimit 53 | } 54 | if workPath == "" { 55 | workPath, _ = os.Getwd() 56 | } 57 | 58 | var ( 59 | f *os.File 60 | err error 61 | ) 62 | if result == "stdout" { 63 | f = os.Stdout 64 | } else if result == "stderr" { 65 | f = os.Stderr 66 | } else { 67 | f, err = os.Create(result) 68 | if err != nil { 69 | debug("Failed to open result file:", err) 70 | return 71 | } 72 | defer f.Close() 73 | } 74 | 75 | rt, err := start() 76 | debug(rt, err) 77 | if e, ok := err.(syscall.Errno); ok { 78 | debug("errno", int(e)) 79 | } 80 | 81 | if rt == nil { 82 | rt = &runner.Result{ 83 | Status: runner.StatusRunnerError, 84 | } 85 | } 86 | if err == nil && rt.Status != runner.StatusNormal { 87 | err = rt.Status 88 | } 89 | debug("setupTime: ", rt.SetUpTime) 90 | debug("runningTime: ", rt.RunningTime) 91 | if err != nil { 92 | debug(err) 93 | c, ok := err.(runner.Status) 94 | if !ok { 95 | c = runner.StatusRunnerError 96 | } 97 | // Handle fatal error from trace 98 | fmt.Fprintf(f, "%d %d %d %d\n", getStatus(c), int(rt.Time/time.Millisecond), uint64(rt.Memory)>>10, rt.ExitStatus) 99 | if c == runner.StatusRunnerError { 100 | os.Exit(1) 101 | } 102 | } else { 103 | fmt.Fprintf(f, "%d %d %d %d\n", 0, int(rt.Time/time.Millisecond), uint64(rt.Memory)>>10, rt.ExitStatus) 104 | } 105 | } 106 | 107 | func start() (*runner.Result, error) { 108 | var sTime, mTime, fTime time.Time 109 | sTime = time.Now() 110 | files, err := prepareFiles(inputFileName, outputFileName, errorFileName) 111 | if err != nil { 112 | return nil, err 113 | } 114 | defer closeFiles(files) 115 | 116 | var profile string 117 | if profilePath != "" { 118 | c, err := 
os.ReadFile(profilePath) 119 | if err != nil { 120 | return nil, fmt.Errorf("profile: %w", err) 121 | } 122 | profile = string(c) 123 | } 124 | 125 | // if not defined, then use the original value 126 | fds := make([]uintptr, len(files)) 127 | for i, f := range files { 128 | if f != nil { 129 | fds[i] = f.Fd() 130 | } else { 131 | fds[i] = uintptr(i) 132 | } 133 | } 134 | 135 | rlims := rlimit.RLimits{ 136 | CPU: timeLimit, 137 | CPUHard: realTimeLimit, 138 | FileSize: outputLimit << 20, 139 | Data: memoryLimit << 20, 140 | AddressSpace: memoryLimit << 20, 141 | Stack: stackLimit << 20, 142 | } 143 | 144 | debug(rlims) 145 | debug(args) 146 | 147 | r := forkexec.Runner{ 148 | Args: args, 149 | Env: []string{pathEnv}, 150 | RLimits: rlims.PrepareRLimit(), 151 | Files: fds, 152 | WorkDir: workPath, 153 | SandboxProfile: profile, 154 | SyncFunc: func(pid int) error { 155 | mTime = time.Now() 156 | return nil 157 | }, 158 | } 159 | pid, err := r.Start() 160 | if err != nil { 161 | return nil, err 162 | } 163 | 164 | defer func() { 165 | killAll(pid) 166 | collectZombie(pid) 167 | }() 168 | 169 | var ( 170 | wstatus syscall.WaitStatus 171 | rusage syscall.Rusage 172 | ) 173 | for { 174 | _, err = syscall.Wait4(pid, &wstatus, 0, &rusage) 175 | if err == syscall.EINTR { 176 | continue 177 | } 178 | fTime = time.Now() 179 | if err != nil { 180 | return nil, err 181 | } 182 | result := runner.Result{ 183 | Status: runner.StatusNormal, 184 | Time: time.Duration(rusage.Utime.Nano()), 185 | Memory: runner.Size(rusage.Maxrss), // seems MacOS uses bytes instead of kb 186 | SetUpTime: mTime.Sub(sTime), 187 | RunningTime: fTime.Sub(mTime), 188 | } 189 | if uint64(result.Time) > timeLimit*1e9 { 190 | result.Status = runner.StatusTimeLimitExceeded 191 | } 192 | if uint64(result.Memory) > memoryLimit<<20 { 193 | result.Status = runner.StatusMemoryLimitExceeded 194 | } 195 | 196 | switch { 197 | case wstatus.Exited(): 198 | if status := wstatus.ExitStatus(); status != 0 { 199 | 
result.Status = runner.StatusNonzeroExitStatus 200 | } 201 | return &result, nil 202 | 203 | case wstatus.Signaled(): 204 | sig := wstatus.Signal() 205 | switch sig { 206 | case unix.SIGXCPU, unix.SIGKILL: 207 | result.Status = runner.StatusTimeLimitExceeded 208 | case unix.SIGXFSZ: 209 | result.Status = runner.StatusOutputLimitExceeded 210 | case unix.SIGSYS: 211 | result.Status = runner.StatusDisallowedSyscall 212 | default: 213 | result.Status = runner.StatusSignalled 214 | } 215 | result.ExitStatus = int(sig) 216 | return &result, nil 217 | } 218 | } 219 | } 220 | 221 | // kill all tracee according to pids 222 | func killAll(pgid int) { 223 | unix.Kill(-pgid, unix.SIGKILL) 224 | } 225 | 226 | // collect died child processes 227 | func collectZombie(pgid int) { 228 | var wstatus unix.WaitStatus 229 | for { 230 | if _, err := unix.Wait4(-pgid, &wstatus, unix.WNOHANG, nil); err != unix.EINTR && err != nil { 231 | break 232 | } 233 | } 234 | } 235 | -------------------------------------------------------------------------------- /container/benchmark_linux_test.go: -------------------------------------------------------------------------------- 1 | package container 2 | 3 | import ( 4 | "context" 5 | "errors" 6 | "os" 7 | "runtime" 8 | "syscall" 9 | "testing" 10 | 11 | "github.com/criyle/go-sandbox/runner" 12 | ) 13 | 14 | func init() { 15 | Init() 16 | } 17 | 18 | func BenchmarkContainer(b *testing.B) { 19 | tmpDir, err := os.MkdirTemp("", "") 20 | if err != nil { 21 | b.Error(err) 22 | } 23 | builder := &Builder{ 24 | Root: tmpDir, 25 | Stderr: os.Stderr, 26 | } 27 | n := runtime.GOMAXPROCS(0) 28 | ch := make(chan Environment, n) 29 | for i := 0; i < n; i++ { 30 | m, err := builder.Build() 31 | if err != nil { 32 | b.Error(err) 33 | } 34 | b.Cleanup(func() { 35 | m.Destroy() 36 | }) 37 | ch <- m 38 | } 39 | b.ResetTimer() 40 | b.RunParallel(func(pb *testing.PB) { 41 | m := <-ch 42 | for pb.Next() { 43 | r := m.Execve(context.TODO(), ExecveParam{ 44 | Args: 
[]string{"/bin/true"}, 45 | Env: []string{"PATH=/bin"}, 46 | }) 47 | if r.Status != runner.StatusNormal { 48 | b.Error(r.Status, r.Error) 49 | } 50 | } 51 | }) 52 | } 53 | 54 | type testCase struct { 55 | name string 56 | param ExecveParam 57 | expected runner.Status 58 | } 59 | 60 | var err error = errors.New("test error") 61 | 62 | var successParam = ExecveParam{ 63 | Args: []string{"/bin/true"}, 64 | Env: []string{"PATH=/bin"}, 65 | } 66 | 67 | var tests []testCase = []testCase{ 68 | { 69 | name: "Success", 70 | param: successParam, 71 | expected: runner.StatusNormal, 72 | }, 73 | { 74 | name: "SuccessWithSync", 75 | param: ExecveParam{ 76 | Args: []string{"/bin/true"}, 77 | Env: []string{"PATH=/bin"}, 78 | SyncFunc: func(p int) error { return nil }, 79 | }, 80 | expected: runner.StatusNormal, 81 | }, 82 | { 83 | name: "NotExists", 84 | param: ExecveParam{ 85 | Args: []string{"not_exists"}, 86 | Env: []string{"PATH=/bin"}, 87 | }, 88 | expected: runner.StatusRunnerError, 89 | }, 90 | { 91 | name: "NotExistsWithSync", 92 | param: ExecveParam{ 93 | Args: []string{"not_exists"}, 94 | Env: []string{"PATH=/bin"}, 95 | SyncFunc: func(p int) error { return nil }, 96 | }, 97 | expected: runner.StatusRunnerError, 98 | }, 99 | { 100 | name: "SyncFuncFail", 101 | param: ExecveParam{ 102 | Args: []string{"/bin/true"}, 103 | Env: []string{"PATH=/bin"}, 104 | SyncFunc: func(pid int) error { 105 | return err 106 | }, 107 | }, 108 | expected: runner.StatusRunnerError, 109 | }, 110 | { 111 | name: "SyncFuncFailAfterExec", 112 | param: ExecveParam{ 113 | Args: []string{"/bin/true"}, 114 | Env: []string{"PATH=/bin"}, 115 | SyncFunc: func(pid int) error { 116 | return err 117 | }, 118 | SyncAfterExec: true, 119 | }, 120 | expected: runner.StatusRunnerError, 121 | }, 122 | } 123 | 124 | type credgen struct{} 125 | 126 | func (c credgen) Get() syscall.Credential { 127 | return syscall.Credential{ 128 | Uid: 10000, 129 | Gid: 10000, 130 | } 131 | } 132 | 133 | func 
TestContainerSetCred(t *testing.T) { 134 | t.Parallel() 135 | if os.Getpid() != 1 { 136 | t.Skip("root required for this test") 137 | } 138 | runTest(t, successParam, runner.StatusNormal, credgen{}) 139 | } 140 | 141 | func runTest(t *testing.T, param ExecveParam, expected runner.Status, credGen CredGenerator) { 142 | t.Parallel() 143 | m := getEnv(t, credGen) 144 | r := m.Execve(context.TODO(), param) 145 | if r.Status != expected { 146 | t.Fatal(r.Status, r.Error, r) 147 | } 148 | if err := m.Ping(); err != nil { 149 | t.Fatal(err) 150 | } 151 | // can also success once more (no protocol mismatch) 152 | r = m.Execve(context.TODO(), successParam) 153 | if r.Status != runner.StatusNormal { 154 | t.Fatal(r.Status, r.Error, r) 155 | } 156 | } 157 | 158 | func TestCases(t *testing.T) { 159 | for _, c := range tests { 160 | t.Run(c.name, func(t *testing.T) { 161 | runTest(t, c.param, c.expected, nil) 162 | }) 163 | } 164 | } 165 | 166 | func getEnv(t *testing.T, credGen CredGenerator) Environment { 167 | tmpDir, err := os.MkdirTemp("", "") 168 | if err != nil { 169 | t.Fatal(err) 170 | } 171 | t.Cleanup(func() { 172 | os.Remove(tmpDir) 173 | }) 174 | builder := &Builder{ 175 | Root: tmpDir, 176 | CredGenerator: credGen, 177 | Stderr: os.Stderr, 178 | } 179 | m, err := builder.Build() 180 | if err != nil { 181 | t.Fatal(err) 182 | } 183 | t.Cleanup(func() { 184 | m.Destroy() 185 | }) 186 | return m 187 | } 188 | -------------------------------------------------------------------------------- /container/consts_linux.go: -------------------------------------------------------------------------------- 1 | package container 2 | 3 | type cmdType int8 4 | 5 | const ( 6 | cmdPing cmdType = iota + 1 7 | cmdOpen 8 | cmdDelete 9 | cmdReset 10 | cmdExecve 11 | cmdOk 12 | cmdKill 13 | cmdConf 14 | 15 | initArg = "container_init" 16 | 17 | containerUID = 1000 18 | containerGID = 1000 19 | 20 | containerName = "go-sandbox" 21 | containerWD = "/w" 22 | 23 | containerMaxProc = 1 24 | ) 
25 | 26 | var defaultSymLinks = []SymbolicLink{ 27 | {LinkPath: "/dev/fd", Target: "/proc/self/fd"}, 28 | {LinkPath: "/dev/stdin", Target: "/proc/self/fd/0"}, 29 | {LinkPath: "/dev/stdout", Target: "/proc/self/fd/1"}, 30 | {LinkPath: "/dev/stderr", Target: "/proc/self/fd/2"}, 31 | } 32 | 33 | var defaultMaskPaths = []string{ 34 | // https://github.com/containerd/containerd/blob/f0a32c66dad1e9de716c9960af806105d691cd78/oci/spec.go#L165-L176 35 | "/proc/acpi", 36 | "/proc/asound", 37 | "/proc/kcore", 38 | "/proc/keys", 39 | "/proc/latency_stats", 40 | "/proc/timer_list", 41 | "/proc/timer_stats", 42 | "/proc/sched_debug", 43 | "/sys/firmware", 44 | "/proc/scsi", 45 | 46 | "/usr/lib/wsl", 47 | } 48 | -------------------------------------------------------------------------------- /container/container_cmd_linux.go: -------------------------------------------------------------------------------- 1 | package container 2 | 3 | import ( 4 | "bufio" 5 | "errors" 6 | "fmt" 7 | "os" 8 | "path/filepath" 9 | "strings" 10 | 11 | "github.com/criyle/go-sandbox/pkg/unixsocket" 12 | ) 13 | 14 | func (c *containerServer) handlePing() error { 15 | return c.sendReply(reply{}, unixsocket.Msg{}) 16 | } 17 | 18 | func (c *containerServer) handleConf(conf *confCmd) error { 19 | if conf != nil { 20 | c.containerConfig = conf.Conf 21 | if err := initContainer(conf.Conf); err != nil { 22 | return err 23 | } 24 | if c.ContainerUID == 0 { 25 | c.ContainerUID = containerUID 26 | } 27 | if c.ContainerGID == 0 { 28 | c.ContainerGID = containerGID 29 | } 30 | env, err := readDotEnv() 31 | if err != nil { 32 | return err 33 | } 34 | c.defaultEnv = env 35 | } 36 | return c.sendReply(reply{}, unixsocket.Msg{}) 37 | } 38 | 39 | func (c *containerServer) handleOpen(open []OpenCmd) error { 40 | if len(open) == 0 { 41 | return c.sendErrorReply("open: no open parameter received") 42 | } 43 | 44 | // open files 45 | fds := make([]int, 0, len(open)) 46 | fileToClose := make([]*os.File, 0, len(open)) // let 
// readDotEnv reads /.env and returns its entries for use as default
// environment variables.
//
// Each line is trimmed of surrounding white space; empty lines and lines
// starting with "#" are skipped. Every remaining line must contain "=" and is
// returned verbatim (after trimming). A missing /.env is not an error: the
// function then returns nil, nil. An error is returned when the file cannot
// be opened for another reason, when a line has no "=", or when reading the
// file fails part-way (including a line too long for bufio.Scanner).
func readDotEnv() ([]string, error) {
	f, err := os.Open("/.env")
	if err != nil {
		if errors.Is(err, os.ErrNotExist) {
			return nil, nil
		}
		return nil, fmt.Errorf("dotenv: open /.env: %w", err)
	}
	defer f.Close()

	var ret []string
	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())
		if len(line) == 0 || strings.HasPrefix(line, "#") {
			continue
		}
		if !strings.Contains(line, "=") {
			return nil, fmt.Errorf("dotenv: invalid line: %s", line)
		}
		ret = append(ret, line)
	}
	// Scan also returns false on a read error; without this check the
	// environment would be silently truncated.
	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("dotenv: read /.env: %w", err)
	}
	return ret, nil
}
-------------------------------------------------------------------------------- 1 | package container 2 | 3 | import ( 4 | "fmt" 5 | "syscall" 6 | "time" 7 | 8 | "github.com/criyle/go-sandbox/pkg/forkexec" 9 | "github.com/criyle/go-sandbox/pkg/unixsocket" 10 | "github.com/criyle/go-sandbox/runner" 11 | ) 12 | 13 | func (c *containerServer) handleExecve(cmd *execCmd, msg unixsocket.Msg) error { 14 | var ( 15 | files []uintptr 16 | execFile uintptr 17 | cgroupFd uintptr 18 | cred *syscall.Credential 19 | ) 20 | if cmd == nil { 21 | return c.sendErrorReply("handle: no parameter provided") 22 | } 23 | if len(msg.Fds) > 0 { 24 | files = intSliceToUintptr(msg.Fds) 25 | // don't leak fds to child 26 | closeOnExecFds(msg.Fds) 27 | // release files after execve 28 | defer closeFds(msg.Fds) 29 | } 30 | 31 | // if fexecve, then the first fd must be executable 32 | if cmd.FdExec { 33 | if len(files) == 0 { 34 | return c.sendErrorReply("handle: expected fexecve fd") 35 | } 36 | execFile = files[0] 37 | files = files[1:] 38 | } 39 | // if cgroupFd, then the cgroupFd follows 40 | if cmd.FdCgroup { 41 | if len(files) == 0 { 42 | return c.sendErrorReply("handle: expected cgroup fd") 43 | } 44 | cgroupFd = files[0] 45 | files = files[1:] 46 | } 47 | 48 | var env []string 49 | env = append(env, c.defaultEnv...) 50 | env = append(env, cmd.Env...) 
51 | 52 | if len(cmd.Argv) > 0 { 53 | exePath, err := lookPath(cmd.Argv[0], env) 54 | if err != nil { 55 | return c.sendErrorReply("handle: %s: %v", cmd.Argv[0], err) 56 | } 57 | cmd.Argv[0] = exePath 58 | } 59 | 60 | syncPid := func(pid int) error { 61 | msg := unixsocket.Msg{ 62 | Cred: &syscall.Ucred{ 63 | Pid: int32(pid), 64 | Uid: uint32(syscall.Getuid()), 65 | Gid: uint32(syscall.Getgid()), 66 | }, 67 | } 68 | if err := c.sendReply(reply{}, msg); err != nil { 69 | return fmt.Errorf("sync func: send reply: %w", err) 70 | } 71 | cmd, _, err := c.recvCmd() 72 | if err != nil { 73 | return fmt.Errorf("sync func: recv cmd: %w", err) 74 | } 75 | if cmd.Cmd == cmdKill { 76 | return fmt.Errorf("sync func: received kill") 77 | } 78 | return nil 79 | } 80 | var syncFunc func(pid int) error 81 | if !cmd.SyncAfter { 82 | syncFunc = syncPid 83 | } 84 | 85 | if c.Cred { 86 | cred = &syscall.Credential{ 87 | Uid: uint32(c.ContainerUID), 88 | Gid: uint32(c.ContainerGID), 89 | NoSetGroups: true, 90 | } 91 | } 92 | 93 | var seccomp *syscall.SockFprog 94 | if cmd.Seccomp != nil { 95 | seccomp = cmd.Seccomp.SockFprog() 96 | } 97 | 98 | r := forkexec.Runner{ 99 | Args: cmd.Argv, 100 | Env: env, 101 | ExecFile: execFile, 102 | RLimits: cmd.RLimits, 103 | Files: files, 104 | WorkDir: c.WorkDir, 105 | NoNewPrivs: true, 106 | DropCaps: true, 107 | SyncFunc: syncFunc, 108 | Credential: cred, 109 | CTTY: cmd.CTTY, 110 | Seccomp: seccomp, 111 | CgroupFd: cgroupFd, 112 | 113 | UnshareCgroupAfterSync: c.UnshareCgroup, 114 | } 115 | // starts the runner, error is handled same as wait4 to make communication equal 116 | pid, err := r.Start() 117 | if err != nil { 118 | s := "" 119 | if len(cmd.Argv) > 0 { 120 | s = cmd.Argv[0] 121 | } 122 | return c.sendErrorReply("start: %s: %v", s, err) 123 | } 124 | if cmd.SyncAfter { 125 | if err := syncPid(1); err != nil { 126 | syscall.Kill(-1, syscall.SIGKILL) 127 | 128 | c.waitPid <- pid 129 | ret := <-c.waitPidResult 130 | err := 
c.sendReply(convertReply(ret), unixsocket.Msg{}) 131 | 132 | c.waitAll <- struct{}{} 133 | <-c.waitAllDone 134 | return err 135 | } 136 | } 137 | return c.handleExecveStarted(pid) 138 | } 139 | 140 | func (c *containerServer) handleExecveStarted(pid int) error { 141 | // At this point, either recv kill / send result would be happened 142 | // host -> container: kill 143 | // container -> host: result 144 | // container -> host: done 145 | 146 | // Let's register a wait event 147 | c.waitPid <- pid 148 | 149 | var ret waitPidResult 150 | select { 151 | case <-c.done: // socket error happened 152 | return c.err 153 | 154 | case <-c.recvCh: // kill cmd received 155 | syscall.Kill(-1, syscall.SIGKILL) 156 | ret = <-c.waitPidResult 157 | c.waitAll <- struct{}{} 158 | 159 | if err := c.sendReply(convertReply(ret), unixsocket.Msg{}); err != nil { 160 | return err 161 | } 162 | 163 | case ret = <-c.waitPidResult: // child process returned 164 | syscall.Kill(-1, syscall.SIGKILL) 165 | c.waitAll <- struct{}{} 166 | 167 | if err := c.sendReply(convertReply(ret), unixsocket.Msg{}); err != nil { 168 | return err 169 | } 170 | if _, _, err := c.recvCmd(); err != nil { // kill cmd received 171 | return err 172 | } 173 | } 174 | <-c.waitAllDone 175 | return nil 176 | } 177 | 178 | func convertReply(ret waitPidResult) reply { 179 | if ret.Err != nil { 180 | return reply{ 181 | Error: &errorReply{ 182 | Msg: fmt.Sprintf("execve: wait4: %v", ret.Err), 183 | }, 184 | } 185 | } 186 | 187 | waitStatus := ret.WaitStatus 188 | rusage := ret.Rusage 189 | 190 | status := runner.StatusNormal 191 | userTime := time.Duration(rusage.Utime.Nano()) // ns 192 | userMem := runner.Size(rusage.Maxrss << 10) // bytes 193 | switch { 194 | case waitStatus.Exited(): 195 | exitStatus := waitStatus.ExitStatus() 196 | if exitStatus != 0 { 197 | status = runner.StatusNonzeroExitStatus 198 | } 199 | return reply{ 200 | ExecReply: &execReply{ 201 | Status: status, 202 | ExitStatus: exitStatus, 203 | Time: 
// It creates a container within an unshared container and communicates
// with the host process using a unix socket, with
// oob for fd / pid and commands encoded by gob.
// Any socket-related error will cause the container to exit, along with all processes inside the container.
c.sendCmd(cmd, unixsocket.Msg{}); err != nil { 27 | return fmt.Errorf("ping: %w", err) 28 | } 29 | // receive no error 30 | return c.recvAckReply("ping") 31 | } 32 | 33 | // conf send configuration to container (used by builder only) 34 | func (c *container) conf(conf *containerConfig) error { 35 | c.mu.Lock() 36 | defer c.mu.Unlock() 37 | 38 | cmd := cmd{ 39 | Cmd: cmdConf, 40 | ConfCmd: &confCmd{Conf: *conf}, 41 | } 42 | if err := c.sendCmd(cmd, unixsocket.Msg{}); err != nil { 43 | return fmt.Errorf("conf: %w", err) 44 | } 45 | return c.recvAckReply("conf") 46 | } 47 | 48 | // Open open files in container 49 | func (c *container) Open(p []OpenCmd) ([]*os.File, error) { 50 | c.mu.Lock() 51 | defer c.mu.Unlock() 52 | 53 | syscall.ForkLock.RLock() 54 | defer syscall.ForkLock.RUnlock() 55 | 56 | // send copyin 57 | cmd := cmd{ 58 | Cmd: cmdOpen, 59 | OpenCmd: p, 60 | } 61 | if err := c.sendCmd(cmd, unixsocket.Msg{}); err != nil { 62 | return nil, fmt.Errorf("open: %w", err) 63 | } 64 | reply, msg, err := c.recvReply() 65 | if err != nil { 66 | return nil, fmt.Errorf("open: %w", err) 67 | } 68 | if reply.Error != nil { 69 | return nil, fmt.Errorf("open: %v", reply.Error) 70 | } 71 | if len(msg.Fds) != len(p) { 72 | closeFds(msg.Fds) 73 | return nil, fmt.Errorf("open: unexpected number of fds: got %d, want %d", len(msg.Fds), len(p)) 74 | } 75 | 76 | ret := make([]*os.File, 0, len(p)) 77 | for i, fd := range msg.Fds { 78 | syscall.CloseOnExec(fd) 79 | f := os.NewFile(uintptr(fd), p[i].Path) 80 | if f == nil { 81 | closeFds(msg.Fds) 82 | return nil, fmt.Errorf("open: failed to create file for fd: %d", fd) 83 | } 84 | ret = append(ret, f) 85 | } 86 | return ret, nil 87 | } 88 | 89 | // Delete remove file from container 90 | func (c *container) Delete(p string) error { 91 | c.mu.Lock() 92 | defer c.mu.Unlock() 93 | 94 | cmd := cmd{ 95 | Cmd: cmdDelete, 96 | DeleteCmd: &deleteCmd{Path: p}, 97 | } 98 | if err := c.sendCmd(cmd, unixsocket.Msg{}); err != nil { 99 | return 
// ExecveParam holds the parameters used to run a process inside the container.
type ExecveParam struct {
	// Args holds the command line arguments.
	Args []string

	// Env specifies the environment of the process.
	Env []string

	// Files specifies the file descriptors for the child process.
	Files []uintptr

	// ExecFile specifies the file descriptor of the executable file to run
	// using fexecve. A zero value means it is not used.
	ExecFile uintptr

	// CgroupFD specifies the file descriptor for cgroup V2.
	// A zero value means it is not used.
	CgroupFD uintptr

	// RLimits specifies the POSIX resource limits applied through setrlimit.
	RLimits []rlimit.RLimit

	// Seccomp specifies the seccomp filter.
	Seccomp seccomp.Filter

	// CTTY specifies whether to set the controlling TTY.
	CTTY bool

	// SyncFunc is called with the pid just before execve
	// (for example, to attach the process to cgroups).
	SyncFunc func(pid int) error

	// SyncAfterExec makes SyncFunc synchronise after the execution has started.
	// Since the process is then not guaranteed to still exist (it may exit
	// early), its pid is not passed.
	SyncAfterExec bool
}
// Execve runs process inside container. It accepts context cancellation as time limit exceeded. 49 | func (c *container) Execve(ctx context.Context, param ExecveParam) runner.Result { 50 | c.mu.Lock() 51 | defer c.mu.Unlock() 52 | 53 | sTime := time.Now() 54 | 55 | // if execve with fd, put fd at the first parameter 56 | var files []int 57 | if param.ExecFile > 0 { 58 | files = append(files, int(param.ExecFile)) 59 | } 60 | if param.CgroupFD > 0 { 61 | files = append(files, int(param.CgroupFD)) 62 | } 63 | files = append(files, uintptrSliceToInt(param.Files)...) 64 | msg := unixsocket.Msg{ 65 | Fds: files, 66 | } 67 | execCmd := &execCmd{ 68 | Argv: param.Args, 69 | Env: param.Env, 70 | RLimits: param.RLimits, 71 | Seccomp: param.Seccomp, 72 | FdExec: param.ExecFile > 0, 73 | CTTY: param.CTTY, 74 | SyncAfter: param.SyncAfterExec, 75 | FdCgroup: param.CgroupFD > 0, 76 | } 77 | cm := cmd{ 78 | Cmd: cmdExecve, 79 | ExecCmd: execCmd, 80 | } 81 | if err := c.sendCmd(cm, msg); err != nil { 82 | return errResult("execve: sendCmd %v", err) 83 | } 84 | // sync function 85 | rep, msg, err := c.recvReply() 86 | if err != nil { 87 | return errResult("execve: recvReply %v", err) 88 | } 89 | // if sync function did not involved 90 | if rep.Error != nil { 91 | return errResult("execve: %v", rep.Error) 92 | } 93 | // if pid not received 94 | if msg.Cred == nil { 95 | // tell kill function to exit and sync 96 | c.execveSyncKill() 97 | // tell err exec function to exit and sync 98 | c.execveSyncKill() 99 | return errResult("execve: no pid received") 100 | } 101 | if param.SyncFunc != nil { 102 | if err := param.SyncFunc(int(msg.Cred.Pid)); err != nil { 103 | // tell sync function to exit and recv error 104 | c.execveSyncKill() 105 | return errResult("execve: syncfunc failed %v", err) 106 | } 107 | } 108 | // send to syncFunc ack ok 109 | if err := c.sendCmd(cmd{Cmd: cmdOk}, unixsocket.Msg{}); err != nil { 110 | return errResult("execve: ack failed %v", err) 111 | } 112 | 113 | // 
wait for done 114 | return c.waitForDone(ctx, sTime) 115 | } 116 | 117 | func (c *container) waitForDone(ctx context.Context, sTime time.Time) runner.Result { 118 | mTime := time.Now() 119 | select { 120 | case <-c.done: // socket error 121 | return convertReplyResult(reply{}, sTime, mTime, c.err) 122 | 123 | case <-ctx.Done(): // cancel 124 | c.sendCmd(cmd{Cmd: cmdKill}, unixsocket.Msg{}) // kill 125 | reply, _, err := c.recvReply() 126 | return convertReplyResult(reply, sTime, mTime, err) 127 | 128 | case ret := <-c.recvCh: // result 129 | err := c.sendCmd(cmd{Cmd: cmdKill}, unixsocket.Msg{}) // kill 130 | return convertReplyResult(ret.Reply, sTime, mTime, err) 131 | } 132 | } 133 | 134 | func convertReplyResult(reply reply, sTime, mTime time.Time, err error) runner.Result { 135 | // handle potential error 136 | if err != nil { 137 | return runner.Result{ 138 | Status: runner.StatusRunnerError, 139 | Error: err.Error(), 140 | } 141 | } 142 | if reply.Error != nil { 143 | return runner.Result{ 144 | Status: runner.StatusRunnerError, 145 | Error: reply.Error.Error(), 146 | } 147 | } 148 | if reply.ExecReply == nil { 149 | return runner.Result{ 150 | Status: runner.StatusRunnerError, 151 | Error: "execve: no reply received", 152 | } 153 | } 154 | // emit result after all communication finish 155 | return runner.Result{ 156 | Status: reply.ExecReply.Status, 157 | ExitStatus: reply.ExecReply.ExitStatus, 158 | Time: reply.ExecReply.Time, 159 | Memory: reply.ExecReply.Memory, 160 | SetUpTime: mTime.Sub(sTime), 161 | RunningTime: time.Since(mTime), 162 | } 163 | } 164 | 165 | // execveSyncKill will send kill and recv reply 166 | func (c *container) execveSyncKill() { 167 | c.sendCmd(cmd{Cmd: cmdKill}, unixsocket.Msg{}) 168 | c.recvReply() 169 | } 170 | 171 | func errResult(f string, v ...interface{}) runner.Result { 172 | return runner.Result{ 173 | Status: runner.StatusRunnerError, 174 | Error: fmt.Sprintf(f, v...), 175 | } 176 | } 177 | 
// findExecutable reports whether file can be treated as an executable.
//
// It returns nil when file exists, is not a directory and has at least one
// execute permission bit set. The os.Stat error is returned unchanged when
// the file cannot be stat'ed; otherwise fs.ErrPermission is returned.
func findExecutable(file string) error {
	info, statErr := os.Stat(file)
	if statErr != nil {
		return statErr
	}
	mode := info.Mode()
	// directories and files without any x bit are rejected
	if mode.IsDir() || mode&0111 == 0 {
		return fs.ErrPermission
	}
	return nil
}
45 | } 46 | p := filepath.Join(dir, name) 47 | if err := findExecutable(p); err == nil { 48 | return p, nil 49 | } 50 | } 51 | return "", errNotFound 52 | } 53 | 54 | func findPath(env []string) ([]string, error) { 55 | // find PATH= 56 | const pathPrefix = "PATH=" 57 | for i := len(env) - 1; i >= 0; i-- { 58 | s := env[i] 59 | if strings.HasPrefix(s, pathPrefix) { 60 | return filepath.SplitList(s[len(pathPrefix):]), nil 61 | } 62 | } 63 | return nil, errNoPath 64 | } 65 | -------------------------------------------------------------------------------- /container/protocol_linux.go: -------------------------------------------------------------------------------- 1 | package container 2 | 3 | import ( 4 | "os" 5 | "syscall" 6 | "time" 7 | 8 | "github.com/criyle/go-sandbox/pkg/mount" 9 | "github.com/criyle/go-sandbox/pkg/rlimit" 10 | "github.com/criyle/go-sandbox/pkg/seccomp" 11 | "github.com/criyle/go-sandbox/runner" 12 | ) 13 | 14 | // cmd is the control message send into container 15 | type cmd struct { 16 | DeleteCmd *deleteCmd // delete argument 17 | ExecCmd *execCmd // execve argument 18 | ConfCmd *confCmd // to set configuration 19 | 20 | OpenCmd []OpenCmd // open argument 21 | 22 | Cmd cmdType // type of the cmd 23 | } 24 | 25 | // OpenCmd correspond to a single open syscall 26 | type OpenCmd struct { 27 | Path string 28 | Flag int 29 | Perm os.FileMode 30 | } 31 | 32 | // deleteCmd stores delete command 33 | type deleteCmd struct { 34 | Path string 35 | } 36 | 37 | // execCmd stores execve parameter 38 | type execCmd struct { 39 | Argv []string // execve argv 40 | Env []string // execve env 41 | RLimits []rlimit.RLimit // execve posix rlimit 42 | Seccomp seccomp.Filter // seccomp filter 43 | FdExec bool // if use fexecve (fd[0] as exec) 44 | FdCgroup bool // if use cgroupFd 45 | CTTY bool // if set CTTY 46 | SyncAfter bool // if sync function calls after execve returns 47 | } 48 | 49 | // confCmd stores conf parameter 50 | type confCmd struct { 51 | Conf 
// containerConfig is the container configuration carried by confCmd and
// applied by the container server when it handles a conf command.
type containerConfig struct {
	// WorkDir is used as the working directory of executed processes.
	WorkDir string

	// HostName and DomainName are consumed during container initialisation
	// (presumably the UTS names of the container; verify in initContainer).
	HostName   string
	DomainName string

	// ContainerRoot, SymbolicLinks, MaskPaths and InitCommand are consumed
	// during container initialisation (not shown here; see initContainer).
	// Mounts is additionally used on reset: the contents of every tmpfs
	// mount target are removed.
	ContainerRoot string
	Mounts        []mount.Mount
	SymbolicLinks []SymbolicLink
	MaskPaths     []string
	InitCommand   []string

	// ContainerUID and ContainerGID are the credential applied to executed
	// processes when Cred is true; zero values are replaced by the package
	// defaults when the configuration is applied.
	ContainerUID int
	ContainerGID int
	// Cred enables running executed processes with the credential above.
	Cred bool
	// UnshareCgroup is passed to the runner as UnshareCgroupAfterSync.
	UnshareCgroup bool
}
-------------------------------------------------------------------------------- 1 | //go:build linux && (mips64 || mips64le) 2 | 3 | package container 4 | 5 | import ( 6 | "os" 7 | "syscall" 8 | ) 9 | 10 | var signalToIgnore = []os.Signal{ 11 | // signals that cause run-time panic 12 | syscall.SIGBUS, syscall.SIGFPE, syscall.SIGSEGV, 13 | // signals that cause the program to exit 14 | syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, 15 | // signals that cause the program to exit with a stack dump 16 | syscall.SIGQUIT, syscall.SIGILL, syscall.SIGTRAP, syscall.SIGABRT, syscall.SIGSYS, 17 | } 18 | -------------------------------------------------------------------------------- /container/socket_linux.go: -------------------------------------------------------------------------------- 1 | package container 2 | 3 | import ( 4 | "bytes" 5 | "encoding/gob" 6 | "fmt" 7 | 8 | "github.com/criyle/go-sandbox/pkg/unixsocket" 9 | ) 10 | 11 | // 16k buffer size 12 | const bufferSize = 16 << 10 13 | 14 | type socket struct { 15 | *unixsocket.Socket 16 | 17 | buff []byte 18 | 19 | decoder *gob.Decoder 20 | recvBuff bufferRotator 21 | 22 | encoder *gob.Encoder 23 | sendBuff bytes.Buffer 24 | } 25 | 26 | // bufferRotator replace the underlying Buffers to avoid allocation 27 | type bufferRotator struct { 28 | *bytes.Buffer 29 | } 30 | 31 | func (b *bufferRotator) Rotate(buffer *bytes.Buffer) { 32 | b.Buffer = buffer 33 | } 34 | 35 | func newSocket(s *unixsocket.Socket) *socket { 36 | soc := socket{ 37 | Socket: s, 38 | } 39 | soc.buff = make([]byte, bufferSize) 40 | soc.decoder = gob.NewDecoder(&soc.recvBuff) 41 | soc.encoder = gob.NewEncoder(&soc.sendBuff) 42 | 43 | return &soc 44 | } 45 | 46 | func (s *socket) RecvMsg(e any) (msg unixsocket.Msg, err error) { 47 | n, msg, err := s.Socket.RecvMsg(s.buff) 48 | if err != nil { 49 | return msg, fmt.Errorf("recv msg: %w", err) 50 | } 51 | s.recvBuff.Rotate(bytes.NewBuffer(s.buff[:n])) 52 | 53 | if err := s.decoder.Decode(e); err != nil { 
// removeContents deletes everything inside dir while keeping dir itself.
//
// It returns an error when dir cannot be opened or listed. Otherwise every
// entry is removed with os.RemoveAll; removal continues after a failure and
// the first failure encountered is returned once all entries were tried.
func removeContents(dir string) error {
	d, err := os.Open(dir)
	if err != nil {
		return err
	}
	defer d.Close()

	names, err := d.Readdirnames(-1)
	if err != nil {
		return err
	}

	for _, name := range names {
		// keep going so that as much as possible is cleaned up, but
		// remember the first failure to report it to the caller
		if err1 := os.RemoveAll(filepath.Join(dir, name)); err1 != nil && err == nil {
			err = err1
		}
	}
	return err
}
github.com/criyle/go-sandbox 2 | 3 | go 1.24 4 | 5 | require ( 6 | github.com/elastic/go-seccomp-bpf v1.5.0 7 | golang.org/x/net v0.38.0 8 | golang.org/x/sys v0.31.0 9 | ) 10 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 2 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 3 | github.com/elastic/go-seccomp-bpf v1.5.0 h1:gJV+U1iP+YC70ySyGUUNk2YLJW5/IkEw4FZBJfW8ZZY= 4 | github.com/elastic/go-seccomp-bpf v1.5.0/go.mod h1:umdhQ/3aybliBF2jjiZwS492I/TOKz+ZRvsLT3hVe1o= 5 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 6 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 7 | github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= 8 | github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= 9 | golang.org/x/net v0.38.0 h1:vRMAPTMaeGqVhG5QyLJHqNDwecKTomGeqbnfZyKlBI8= 10 | golang.org/x/net v0.38.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8= 11 | golang.org/x/sys v0.31.0 h1:ioabZlmFYtWhL+TRYpcnNlLwhyxaM9kWTDEmfnprqik= 12 | golang.org/x/sys v0.31.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= 13 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= 14 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 15 | -------------------------------------------------------------------------------- /pkg/cgroup/benchmark_linux_test.go: -------------------------------------------------------------------------------- 1 | package cgroup 2 | 3 | import ( 4 | "os" 5 | "testing" 6 | ) 7 | 8 | func BenchmarkCgroup(b *testing.B) { 9 | if err := EnableV2Nesting(); err != nil { 10 | b.Fatal(err) 11 | } 12 | ct, err := GetAvailableControllerV2() 
13 | if err != nil { 14 | b.Fatal(err) 15 | } 16 | builder, err := New("benchmark", ct) 17 | if err != nil { 18 | b.Fatal(err) 19 | } 20 | defer builder.Destroy() 21 | b.ResetTimer() 22 | for i := 0; i < b.N; i++ { 23 | cg, err := builder.New("test") 24 | if err != nil { 25 | b.Fatal(err) 26 | } 27 | if err := cg.SetCPUSet([]byte("0")); err != nil { 28 | b.Fatal(err) 29 | } 30 | if err := cg.SetMemoryLimit(4096); err != nil { 31 | b.Fatal(err) 32 | } 33 | if err := cg.SetProcLimit(1); err != nil { 34 | b.Fatal(err) 35 | } 36 | if _, err := cg.CPUUsage(); err != nil { 37 | b.Fatal(err) 38 | } 39 | if _, err := cg.MemoryMaxUsage(); err != nil { 40 | b.Fatal(err) 41 | } 42 | cg.Destroy() 43 | } 44 | } 45 | 46 | func TestCgroupAll(t *testing.T) { 47 | // ensure root privilege when testing 48 | if os.Getuid() != 0 { 49 | t.Skip("no root privilege") 50 | } 51 | if err := EnableV2Nesting(); err != nil { 52 | t.Fatal(err) 53 | } 54 | ct, err := GetAvailableControllerV2() 55 | if err != nil { 56 | t.Fatal(err) 57 | } 58 | builder, err := New("benchmark", ct) 59 | if err != nil { 60 | t.Fatal(err) 61 | } 62 | defer builder.Destroy() 63 | if err != nil { 64 | t.Fatal(err) 65 | } 66 | cg, err := builder.New("test") 67 | if err != nil { 68 | t.Fatal(err) 69 | } 70 | t.Cleanup(func() { 71 | cg.Destroy() 72 | }) 73 | if err := cg.SetCPUSet([]byte("0")); err != nil { 74 | t.Fatal(err) 75 | } 76 | if err := cg.SetMemoryLimit(4096); err != nil { 77 | t.Fatal(err) 78 | } 79 | if err := cg.SetProcLimit(1); err != nil { 80 | t.Fatal(err) 81 | } 82 | if _, err := cg.CPUUsage(); err != nil { 83 | t.Fatal(err) 84 | } 85 | if _, err := cg.MemoryMaxUsage(); err != nil { 86 | t.Fatal(err) 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /pkg/cgroup/cgroup_info_linux.go: -------------------------------------------------------------------------------- 1 | package cgroup 2 | 3 | import ( 4 | "bufio" 5 | "fmt" 6 | "os" 7 | "path/filepath" 8 | 
"strconv" 9 | "strings" 10 | ) 11 | 12 | const numberOfControllers = 5 13 | 14 | // Controllers defines enabled controller of a cgroup 15 | type Controllers struct { 16 | CPU bool 17 | CPUSet bool 18 | CPUAcct bool 19 | Memory bool 20 | Pids bool 21 | } 22 | 23 | // Set changes the enabled status of a specific controller 24 | func (c *Controllers) Set(ct string, value bool) { 25 | switch ct { 26 | case CPU: 27 | c.CPU = value 28 | case CPUSet: 29 | c.CPUSet = value 30 | case CPUAcct: 31 | c.CPUAcct = value 32 | case Memory: 33 | c.Memory = value 34 | case Pids: 35 | c.Pids = value 36 | } 37 | } 38 | 39 | // Intersect reset the specific controller if it is not enabled in the other 40 | func (c *Controllers) Intersect(o *Controllers) { 41 | c.CPU = c.CPU && o.CPU 42 | c.CPUSet = c.CPUSet && o.CPUSet 43 | c.CPUAcct = c.CPUAcct && o.CPUAcct 44 | c.Memory = c.Memory && o.Memory 45 | c.Pids = c.Pids && o.Pids 46 | } 47 | 48 | // Contains returns true if the current controller enabled all controllers in the other controller 49 | func (c *Controllers) Contains(o *Controllers) bool { 50 | return (c.CPU || !o.CPU) && (c.CPUSet || !o.CPUSet) && (c.CPUAcct || !o.CPUAcct) && 51 | (c.Memory || !o.Memory) && (c.Pids || !o.Pids) 52 | } 53 | 54 | // Names returns a list of string of all enabled container names 55 | func (c *Controllers) Names() []string { 56 | names := make([]string, 0, numberOfControllers) 57 | for _, v := range []struct { 58 | e bool 59 | n string 60 | }{ 61 | {c.CPU, CPU}, 62 | {c.CPUAcct, CPUAcct}, 63 | {c.CPUSet, CPUSet}, 64 | {c.Memory, Memory}, 65 | {c.Pids, Pids}, 66 | } { 67 | if v.e { 68 | names = append(names, v.n) 69 | } 70 | } 71 | return names 72 | } 73 | 74 | func (c *Controllers) String() string { 75 | return "[" + strings.Join(c.Names(), ", ") + "]" 76 | } 77 | 78 | // Info reads the cgroup mount info from /proc/cgroups 79 | type Info struct { 80 | Hierarchy int 81 | NumCgroups int 82 | Enabled bool 83 | } 84 | 85 | // GetCgroupV1Info read 
/proc/cgroups and return the result 86 | func GetCgroupV1Info() (map[string]Info, error) { 87 | f, err := os.Open(procCgroupsPath) 88 | if err != nil { 89 | return nil, err 90 | } 91 | defer f.Close() 92 | 93 | rt := make(map[string]Info) 94 | s := bufio.NewScanner(f) 95 | for s.Scan() { 96 | text := s.Text() 97 | if text[0] == '#' { 98 | continue 99 | } 100 | parts := strings.Fields(text) 101 | if len(parts) < 4 { 102 | continue 103 | } 104 | 105 | // format: subsys_name hierarchy num_cgroups enabled 106 | name := parts[0] 107 | hierarchy, err := strconv.Atoi(parts[1]) 108 | if err != nil { 109 | return nil, err 110 | } 111 | numCgroups, err := strconv.Atoi(parts[2]) 112 | if err != nil { 113 | return nil, err 114 | } 115 | enabled := parts[3] != "0" 116 | rt[name] = Info{ 117 | Hierarchy: hierarchy, 118 | NumCgroups: numCgroups, 119 | Enabled: enabled, 120 | } 121 | } 122 | if err := s.Err(); err != nil { 123 | return nil, err 124 | } 125 | return rt, nil 126 | } 127 | 128 | // GetCurrentCgroupPrefix returns the cgroup prefix of current process 129 | func GetCurrentCgroupPrefix() (string, error) { 130 | c, err := os.ReadFile(procSelfCgroup) 131 | if err != nil { 132 | return "", err 133 | } 134 | firstLine, _, _ := strings.Cut(string(c), "\n") 135 | f := strings.Split(firstLine, ":") 136 | if len(f) < 3 { 137 | return "", fmt.Errorf("invalid " + procSelfCgroup) 138 | } 139 | return f[2][1:], nil 140 | } 141 | 142 | // GetAvailableController returns available cgroup controller in the system 143 | func GetAvailableController() (*Controllers, error) { 144 | if DetectedCgroupType == TypeV1 { 145 | return GetAvailableControllerV1() 146 | } 147 | return GetAvailableControllerV2() 148 | } 149 | 150 | // GetAvailableControllerWithPrefix returns available cgroup controller within the cgroup prefix 151 | func GetAvailableControllerWithPrefix(prefix string) (*Controllers, error) { 152 | if DetectedCgroupType == TypeV1 { 153 | return GetAvailableControllerV1() 154 | } 155 | 
return getAvailableControllerV2(prefix) 156 | } 157 | 158 | // GetAvailableControllerV1 reads /proc/cgroups and get all available controller as set 159 | func GetAvailableControllerV1() (*Controllers, error) { 160 | info, err := GetCgroupV1Info() 161 | if err != nil { 162 | return nil, err 163 | } 164 | 165 | rt := &Controllers{} 166 | for k, v := range info { 167 | if !v.Enabled { 168 | continue 169 | } 170 | rt.Set(k, true) 171 | } 172 | return rt, nil 173 | } 174 | 175 | // GetAvailableControllerV2 reads /sys/fs/cgroup/cgroup.controllers to get all controller 176 | func GetAvailableControllerV2() (*Controllers, error) { 177 | return getAvailableControllerV2(".") 178 | } 179 | 180 | func getAvailableControllerV2(prefix string) (*Controllers, error) { 181 | return getAvailableControllerV2path(filepath.Join(basePath, prefix, cgroupControllers)) 182 | } 183 | 184 | func getAvailableControllerV2path(p string) (*Controllers, error) { 185 | c, err := readFile(p) 186 | if err != nil { 187 | return nil, err 188 | } 189 | 190 | m := &Controllers{} 191 | f := strings.Fields(string(c)) 192 | for _, v := range f { 193 | m.Set(v, true) 194 | } 195 | return m, nil 196 | } 197 | -------------------------------------------------------------------------------- /pkg/cgroup/consts_linux.go: -------------------------------------------------------------------------------- 1 | package cgroup 2 | 3 | // Cgroup constants 4 | const ( 5 | // systemd mounted cgroups 6 | basePath = "/sys/fs/cgroup" 7 | cgroupProcs = "cgroup.procs" 8 | procCgroupsPath = "/proc/cgroups" 9 | procSelfCgroup = "/proc/self/cgroup" 10 | 11 | cgroupSubtreeControl = "cgroup.subtree_control" 12 | cgroupControllers = "cgroup.controllers" 13 | 14 | filePerm = 0644 15 | dirPerm = 0755 16 | 17 | CPU = "cpu" 18 | CPUAcct = "cpuacct" 19 | CPUSet = "cpuset" 20 | Memory = "memory" 21 | Pids = "pids" 22 | ) 23 | 24 | // Type defines the version of cgroup 25 | type Type int 26 | 27 | // Type enum for cgroup 28 | const ( 29 | 
TypeV1 = iota + 1 30 | TypeV2 31 | ) 32 | 33 | func (t Type) String() string { 34 | switch t { 35 | case TypeV1: 36 | return "v1" 37 | case TypeV2: 38 | return "v2" 39 | default: 40 | return "invalid" 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /pkg/cgroup/doc.go: -------------------------------------------------------------------------------- 1 | // Package cgroup provides builder to create cgroup 2 | // under systemd defined mount path (i.e.,sys/fs/cgroup) including v1 and 3 | // v2 implementation. 4 | // 5 | // Available cgroup controller: 6 | // 7 | // cpu 8 | // cpuset 9 | // cpuacct 10 | // memory 11 | // pids 12 | // 13 | // Current not available: devices, freezer, net_cls, blkio, perf_event, net_prio, huge_tlb, rdma 14 | package cgroup 15 | -------------------------------------------------------------------------------- /pkg/cgroup/utils_linux.go: -------------------------------------------------------------------------------- 1 | package cgroup 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "io/fs" 7 | "math/rand/v2" 8 | "os" 9 | "path/filepath" 10 | "strconv" 11 | "strings" 12 | "syscall" 13 | 14 | "golang.org/x/sys/unix" 15 | ) 16 | 17 | // EnsureDirExists creates directories if the path not exists 18 | func EnsureDirExists(path string) error { 19 | if _, err := os.Stat(path); os.IsNotExist(err) { 20 | return os.MkdirAll(path, dirPerm) 21 | } 22 | return os.ErrExist 23 | } 24 | 25 | // CreateV1ControllerPath create path for controller with given group, prefix 26 | func CreateV1ControllerPath(controller, prefix string) (string, error) { 27 | p := filepath.Join(basePath, controller, prefix) 28 | return p, EnsureDirExists(p) 29 | } 30 | 31 | const initPath = "init" 32 | 33 | // EnableV2Nesting migrates all process in the container to nested /init path 34 | // and enables all available controllers in the root cgroup 35 | func EnableV2Nesting() error { 36 | if DetectType() != TypeV2 { 37 | return nil 38 | } 
39 | 40 | p, err := readFile(filepath.Join(basePath, cgroupProcs)) 41 | if err != nil { 42 | return err 43 | } 44 | procs := strings.Split(string(p), "\n") 45 | if len(procs) == 0 { 46 | return nil 47 | } 48 | 49 | // mkdir init 50 | if err := os.Mkdir(filepath.Join(basePath, initPath), dirPerm); err != nil && !errors.Is(err, os.ErrExist) { 51 | return err 52 | } 53 | // move all process into init cgroup 54 | procFile, err := os.OpenFile(filepath.Join(basePath, initPath, cgroupProcs), os.O_RDWR, filePerm) 55 | if err != nil { 56 | return err 57 | } 58 | for _, v := range procs { 59 | if _, err := procFile.WriteString(v); err != nil { 60 | continue 61 | //return err 62 | } 63 | } 64 | procFile.Close() 65 | return nil 66 | } 67 | 68 | // ReadProcesses reads cgroup.procs file and return pids individually 69 | func ReadProcesses(path string) ([]int, error) { 70 | content, err := readFile(path) 71 | if err != nil { 72 | return nil, err 73 | } 74 | procs := strings.Split(string(content), "\n") 75 | rt := make([]int, len(procs)) 76 | for i, x := range procs { 77 | if len(x) == 0 { 78 | continue 79 | } 80 | rt[i], err = strconv.Atoi(x) 81 | if err != nil { 82 | return nil, err 83 | } 84 | } 85 | return rt, nil 86 | } 87 | 88 | // AddProcesses add processes into cgroup.procs file 89 | func AddProcesses(path string, procs []int) error { 90 | f, err := os.OpenFile(path, os.O_RDWR, filePerm) 91 | if err != nil { 92 | return err 93 | } 94 | defer f.Close() 95 | for _, p := range procs { 96 | if _, err := f.WriteString(strconv.Itoa(p)); err != nil { 97 | return err 98 | } 99 | } 100 | return nil 101 | } 102 | 103 | // DetectType detects current mounted cgroup type in systemd default path 104 | func DetectType() Type { 105 | // if /sys/fs/cgroup is mounted as CGROUPV2 or TMPFS (V1) 106 | var st unix.Statfs_t 107 | if err := unix.Statfs(basePath, &st); err != nil { 108 | // ignore errors, defaulting to CgroupV1 109 | return TypeV1 110 | } 111 | if st.Type == 
unix.CGROUP2_SUPER_MAGIC { 112 | return TypeV2 113 | } 114 | return TypeV1 115 | } 116 | 117 | func remove(name string) error { 118 | if name != "" { 119 | // os.Remove tried to Unlink, then Rmdir. Since we only delete directories, use 120 | // Rmdir directly 121 | return syscall.Rmdir(name) 122 | } 123 | return nil 124 | } 125 | 126 | var errPatternHasSeparator = errors.New("pattern contains path separator") 127 | 128 | // prefixAndSuffix splits pattern by the last wildcard "*", if applicable, 129 | // returning prefix as the part before "*" and suffix as the part after "*". 130 | func prefixAndSuffix(pattern string) (prefix, suffix string, err error) { 131 | for i := 0; i < len(pattern); i++ { 132 | if os.IsPathSeparator(pattern[i]) { 133 | return "", "", errPatternHasSeparator 134 | } 135 | } 136 | if pos := strings.LastIndexByte(pattern, '*'); pos != -1 { 137 | prefix, suffix = pattern[:pos], pattern[pos+1:] 138 | } else { 139 | prefix = pattern 140 | } 141 | return prefix, suffix, nil 142 | } 143 | 144 | func readFile(p string) ([]byte, error) { 145 | data, err := os.ReadFile(p) 146 | for err != nil && errors.Is(err, syscall.EINTR) { 147 | data, err = os.ReadFile(p) 148 | } 149 | return data, err 150 | } 151 | 152 | func writeFile(p string, content []byte, perm fs.FileMode) error { 153 | err := os.WriteFile(p, content, perm) 154 | for err != nil && errors.Is(err, syscall.EINTR) { 155 | err = os.WriteFile(p, content, perm) 156 | } 157 | return err 158 | } 159 | 160 | func nextRandom() string { 161 | return strconv.Itoa(int(rand.Int32())) 162 | } 163 | 164 | // randomBuild creates a cgroup with random directory, similar to os.MkdirTemp 165 | func randomBuild(pattern string, build func(string) (Cgroup, error)) (Cgroup, error) { 166 | prefix, suffix, err := prefixAndSuffix(pattern) 167 | if err != nil { 168 | return nil, fmt.Errorf("cgroup.builder: random %w", err) 169 | } 170 | 171 | try := 0 172 | for { 173 | name := prefix + nextRandom() + suffix 174 | cg, err 
:= build(name) 175 | if err == nil { 176 | return cg, nil 177 | } 178 | if errors.Is(err, os.ErrExist) || (cg != nil && cg.Existing()) { 179 | if try++; try < 10000 { 180 | continue 181 | } 182 | return nil, fmt.Errorf("cgroup.builder: tried 10000 times but failed") 183 | } 184 | return nil, fmt.Errorf("cgroup.builder: random %w", err) 185 | } 186 | } 187 | -------------------------------------------------------------------------------- /pkg/cgroup/v1controller_linux.go: -------------------------------------------------------------------------------- 1 | package cgroup 2 | 3 | import ( 4 | "errors" 5 | "path/filepath" 6 | "strconv" 7 | "strings" 8 | ) 9 | 10 | // v1controller is the accessor for single cgroup resource with given path 11 | type v1controller struct { 12 | path string 13 | } 14 | 15 | // ErrNotInitialized returned when trying to read from not initialized cgroup 16 | var ErrNotInitialized = errors.New("cgroup was not initialized") 17 | 18 | // newV1Controller creates a cgroup accessor with given path (path needs to be created in advance) 19 | func newV1Controller(p string) *v1controller { 20 | return &v1controller{path: p} 21 | } 22 | 23 | // WriteUint writes uint64 into given file 24 | func (c *v1controller) WriteUint(filename string, i uint64) error { 25 | if c == nil || c.path == "" { 26 | return nil 27 | } 28 | return c.WriteFile(filename, []byte(strconv.FormatUint(i, 10))) 29 | } 30 | 31 | // ReadUint read uint64 from given file 32 | func (c *v1controller) ReadUint(filename string) (uint64, error) { 33 | if c == nil || c.path == "" { 34 | return 0, ErrNotInitialized 35 | } 36 | b, err := c.ReadFile(filename) 37 | if err != nil { 38 | return 0, err 39 | } 40 | s, err := strconv.ParseUint(strings.TrimSpace(string(b)), 10, 64) 41 | if err != nil { 42 | return 0, err 43 | } 44 | return s, nil 45 | } 46 | 47 | // WriteFile writes cgroup file and handles potential EINTR error while writes to 48 | // the slow device (cgroup) 49 | func (c *v1controller) 
WriteFile(name string, content []byte) error { 50 | if c == nil || c.path == "" { 51 | return ErrNotInitialized 52 | } 53 | p := filepath.Join(c.path, name) 54 | return writeFile(p, content, filePerm) 55 | } 56 | 57 | // ReadFile reads cgroup file and handles potential EINTR error while read to 58 | // the slow device (cgroup) 59 | func (c *v1controller) ReadFile(name string) ([]byte, error) { 60 | if c == nil || c.path == "" { 61 | return nil, nil 62 | } 63 | p := filepath.Join(c.path, name) 64 | return readFile(p) 65 | } 66 | 67 | func (c *v1controller) AddProc(pids ...int) error { 68 | return AddProcesses(filepath.Join(c.path, cgroupProcs), pids) 69 | } 70 | -------------------------------------------------------------------------------- /pkg/cgroup/v2_linux.go: -------------------------------------------------------------------------------- 1 | package cgroup 2 | 3 | import ( 4 | "bufio" 5 | "bytes" 6 | "os" 7 | "path/filepath" 8 | "strconv" 9 | "strings" 10 | "sync" 11 | ) 12 | 13 | // V2 provides cgroup interface for v2 14 | type V2 struct { 15 | path string 16 | control *Controllers 17 | subtreeOnce sync.Once 18 | subtreeErr error 19 | existing bool 20 | } 21 | 22 | var _ Cgroup = &V2{} 23 | 24 | func (c *V2) Open() (*os.File, error) { 25 | return os.OpenFile(c.path, 0, dirPerm) 26 | } 27 | 28 | func (c *V2) String() string { 29 | ct, _ := getAvailableControllerV2path(filepath.Join(c.path, cgroupControllers)) 30 | return "v2(" + c.path + ")" + ct.String() 31 | } 32 | 33 | // AddProc adds processes into the cgroup 34 | func (c *V2) AddProc(pids ...int) error { 35 | return AddProcesses(filepath.Join(c.path, cgroupProcs), pids) 36 | } 37 | 38 | // Processes returns all processes within the cgroup 39 | func (c *V2) Processes() ([]int, error) { 40 | return ReadProcesses(filepath.Join(c.path, cgroupProcs)) 41 | } 42 | 43 | // New creates a sub-cgroup based on the existing one 44 | func (c *V2) New(name string) (Cgroup, error) { 45 | if err := 
		c.enableSubtreeControl(); err != nil {
		return nil, err
	}
	v2 := &V2{
		path:    filepath.Join(c.path, name),
		control: c.control,
	}
	if err := os.Mkdir(v2.path, dirPerm); err != nil {
		if !os.IsExist(err) {
			return nil, err
		}
		// directory was already there: remember that we did not create it
		v2.existing = true
	}
	return v2, nil
}

// Nest creates a sub-cgroup and moves every process currently listed in this
// cgroup's cgroup.procs into it. Subtree control is enabled on this cgroup
// only after the processes have been moved.
func (c *V2) Nest(name string) (Cgroup, error) {
	v2 := &V2{
		path:    filepath.Join(c.path, name),
		control: c.control,
	}
	if err := os.Mkdir(v2.path, dirPerm); err != nil {
		if !os.IsExist(err) {
			return nil, err
		}
		// directory was already there: remember that we did not create it
		v2.existing = true
	}
	p, err := c.Processes()
	if err != nil {
		return nil, err
	}
	if err := v2.AddProc(p...); err != nil {
		return nil, err
	}
	if err := c.enableSubtreeControl(); err != nil {
		return nil, err
	}
	return v2, nil
}

// enableSubtreeControl writes "+<controller> +<controller> ..." to this
// cgroup's cgroup.subtree_control, using the controllers read from its
// cgroup.controllers file. The work runs at most once per V2 value; the
// result of that single attempt is returned on every call.
func (c *V2) enableSubtreeControl() error {
	c.subtreeOnce.Do(func() {
		// controllers available to this cgroup
		ct, err := getAvailableControllerV2path(filepath.Join(c.path, cgroupControllers))
		if err != nil {
			c.subtreeErr = err
			return
		}
		// controllers already listed in cgroup.subtree_control
		ect, err := getAvailableControllerV2path(filepath.Join(c.path, cgroupSubtreeControl))
		if err != nil {
			c.subtreeErr = err
			return
		}
		// nothing to write when the check against the available set succeeds
		// (presumably: everything is already enabled — confirm against Controllers.Contains)
		if ect.Contains(ct) {
			return
		}
		s := ct.Names()
		controlMsg := []byte("+" + strings.Join(s, " +"))
		c.subtreeErr = writeFile(filepath.Join(c.path, cgroupSubtreeControl), controlMsg, filePerm)
	})
	return c.subtreeErr
}

// Random creates a sub-cgroup based on the existing one but the name is randomly generated
func (c *V2) Random(pattern string) (Cgroup, error) {
	return randomBuild(pattern, c.New)
}

// Destroy removes the cgroup directory, unless the cgroup already existed
// when it was opened, in which case it is left untouched.
func (c *V2) Destroy() error {
	if !c.existing {
		return remove(c.path)
	}
	return nil
}

//
Existing returns true if the cgroup was opened rather than created 122 | func (c *V2) Existing() bool { 123 | return c.existing 124 | } 125 | 126 | // CPUUsage reads cpu.stat usage_usec 127 | func (c *V2) CPUUsage() (uint64, error) { 128 | b, err := c.ReadFile("cpu.stat") 129 | if err != nil { 130 | return 0, err 131 | } 132 | s := bufio.NewScanner(bytes.NewReader(b)) 133 | for s.Scan() { 134 | parts := strings.Fields(s.Text()) 135 | if len(parts) == 2 && parts[0] == "usage_usec" { 136 | v, err := strconv.Atoi(parts[1]) 137 | if err != nil { 138 | return 0, err 139 | } 140 | return uint64(v) * 1000, nil // to ns 141 | } 142 | } 143 | return 0, os.ErrNotExist 144 | } 145 | 146 | // MemoryUsage reads memory.current 147 | func (c *V2) MemoryUsage() (uint64, error) { 148 | if !c.control.Memory { 149 | return 0, ErrNotInitialized 150 | } 151 | return c.ReadUint("memory.current") 152 | } 153 | 154 | // MemoryMaxUsage reads memory.peak 155 | func (c *V2) MemoryMaxUsage() (uint64, error) { 156 | if !c.control.Memory { 157 | return 0, ErrNotInitialized 158 | } 159 | return c.ReadUint("memory.peak") 160 | } 161 | 162 | // ProcessPeak reads pids.peak 163 | func (c *V2) ProcessPeak() (uint64, error) { 164 | if !c.control.Pids { 165 | return 0, ErrNotInitialized 166 | } 167 | return c.ReadUint("pids.peak") 168 | } 169 | 170 | // SetCPUBandwidth set cpu.max quota period 171 | func (c *V2) SetCPUBandwidth(quota, period uint64) error { 172 | if !c.control.CPU { 173 | return ErrNotInitialized 174 | } 175 | content := strconv.FormatUint(quota, 10) + " " + strconv.FormatUint(period, 10) 176 | return c.WriteFile("cpu.max", []byte(content)) 177 | } 178 | 179 | // SetCPUSet sets cpuset.cpus 180 | func (c *V2) SetCPUSet(content []byte) error { 181 | if !c.control.CPUSet { 182 | return ErrNotInitialized 183 | } 184 | return c.WriteFile("cpuset.cpus", content) 185 | } 186 | 187 | // SetMemoryLimit memory.max 188 | func (c *V2) SetMemoryLimit(l uint64) error { 189 | if !c.control.Memory { 
190 | return ErrNotInitialized 191 | } 192 | return c.WriteUint("memory.max", l) 193 | } 194 | 195 | // SetProcLimit pids.max 196 | func (c *V2) SetProcLimit(l uint64) error { 197 | if !c.control.Pids { 198 | return ErrNotInitialized 199 | } 200 | return c.WriteUint("pids.max", l) 201 | } 202 | 203 | // WriteUint writes uint64 into given file 204 | func (c *V2) WriteUint(filename string, i uint64) error { 205 | return c.WriteFile(filename, []byte(strconv.FormatUint(i, 10))) 206 | } 207 | 208 | // ReadUint read uint64 from given file 209 | func (c *V2) ReadUint(filename string) (uint64, error) { 210 | b, err := c.ReadFile(filename) 211 | if err != nil { 212 | return 0, err 213 | } 214 | s, err := strconv.ParseUint(strings.TrimSpace(string(b)), 10, 64) 215 | if err != nil { 216 | return 0, err 217 | } 218 | return s, nil 219 | } 220 | 221 | // WriteFile writes cgroup file and handles potential EINTR error while writes to 222 | // the slow device (cgroup) 223 | func (c *V2) WriteFile(name string, content []byte) error { 224 | p := filepath.Join(c.path, name) 225 | return writeFile(p, content, filePerm) 226 | } 227 | 228 | // ReadFile reads cgroup file and handles potential EINTR error while read to 229 | // the slow device (cgroup) 230 | func (c *V2) ReadFile(name string) ([]byte, error) { 231 | p := filepath.Join(c.path, name) 232 | return readFile(p) 233 | } 234 | -------------------------------------------------------------------------------- /pkg/forkexec/bench_linux_test.go: -------------------------------------------------------------------------------- 1 | package forkexec 2 | 3 | import ( 4 | "os" 5 | "syscall" 6 | "testing" 7 | 8 | "github.com/criyle/go-sandbox/pkg/mount" 9 | "golang.org/x/sys/unix" 10 | ) 11 | 12 | // All testing data were from docker env on amd64 arch 13 | 14 | const ( 15 | roBind = unix.MS_BIND | unix.MS_NOSUID | unix.MS_PRIVATE | unix.MS_RDONLY 16 | ) 17 | 18 | var ( 19 | defaultBind = []string{"/usr", "/lib", "/lib64", "/bin"} 20 | ) 21 
| 22 | func BenchmarkStdFork(b *testing.B) { 23 | f := openNull(b) 24 | defer f.Close() 25 | b.RunParallel(func(pb *testing.PB) { 26 | for pb.Next() { 27 | pid, err := syscall.ForkExec("/bin/echo", nil, &syscall.ProcAttr{ 28 | Env: []string{"PATH=/bin"}, 29 | Files: []uintptr{f.Fd(), f.Fd(), f.Fd()}, 30 | }) 31 | if err != nil { 32 | b.Fatal(err) 33 | } 34 | wait4(pid, b) 35 | } 36 | }) 37 | } 38 | 39 | func BenchmarkStdForkUser(b *testing.B) { 40 | f := openNull(b) 41 | defer f.Close() 42 | b.RunParallel(func(pb *testing.PB) { 43 | for pb.Next() { 44 | pid, err := syscall.ForkExec("/bin/echo", nil, &syscall.ProcAttr{ 45 | Env: []string{"PATH=/bin"}, 46 | Files: []uintptr{f.Fd(), f.Fd(), f.Fd()}, 47 | Sys: &syscall.SysProcAttr{ 48 | Cloneflags: syscall.CLONE_NEWUSER, 49 | }, 50 | }) 51 | if err != nil { 52 | b.Fatal(err) 53 | } 54 | wait4(pid, b) 55 | } 56 | }) 57 | } 58 | 59 | // BenchmarkSimpleFork is about 0.70ms/op 60 | func BenchmarkSimpleFork(b *testing.B) { 61 | r, f := getRunner(b) 62 | defer f.Close() 63 | benchmarkRun(r, b) 64 | } 65 | 66 | // BenchmarkUnsharePid is about 0.79ms/op 67 | func BenchmarkUnsharePid(b *testing.B) { 68 | r, f := getRunner(b) 69 | defer f.Close() 70 | r.CloneFlags = unix.CLONE_NEWPID 71 | benchmarkRun(r, b) 72 | } 73 | 74 | // BenchmarkUnshareUser is about 0.84ms/op 75 | func BenchmarkUnshareUser(b *testing.B) { 76 | r, f := getRunner(b) 77 | defer f.Close() 78 | r.CloneFlags = unix.CLONE_NEWUSER 79 | benchmarkRun(r, b) 80 | } 81 | 82 | // BenchmarkUnshareUts is about 0.78ms/op 83 | func BenchmarkUnshareUts(b *testing.B) { 84 | r, f := getRunner(b) 85 | defer f.Close() 86 | r.CloneFlags = unix.CLONE_NEWUTS 87 | benchmarkRun(r, b) 88 | } 89 | 90 | // BenchmarkUnshareCgroup is about 0.85ms/op 91 | func BenchmarkUnshareCgroup(b *testing.B) { 92 | r, f := getRunner(b) 93 | defer f.Close() 94 | r.CloneFlags = unix.CLONE_NEWCGROUP 95 | benchmarkRun(r, b) 96 | } 97 | 98 | // BenchmarkUnshareIpc is about 51ms/op 99 | func 
BenchmarkUnshareIpc(b *testing.B) { 100 | r, f := getRunner(b) 101 | defer f.Close() 102 | r.CloneFlags = unix.CLONE_NEWIPC 103 | benchmarkRun(r, b) 104 | } 105 | 106 | // BenchmarkUnshareMount is about 51ms/op 107 | func BenchmarkUnshareMount(b *testing.B) { 108 | r, f := getRunner(b) 109 | defer f.Close() 110 | r.CloneFlags = unix.CLONE_NEWNS 111 | benchmarkRun(r, b) 112 | } 113 | 114 | // BenchmarkUnshareNet is about 426ms/op 115 | func BenchmarkUnshareNet(b *testing.B) { 116 | r, f := getRunner(b) 117 | defer f.Close() 118 | r.CloneFlags = unix.CLONE_NEWNET 119 | benchmarkRun(r, b) 120 | } 121 | 122 | // BenchmarkFastUnshareMountPivot is about 104ms/op 123 | func BenchmarkFastUnshareMountPivot(b *testing.B) { 124 | root, err := os.MkdirTemp("", "ns") 125 | if err != nil { 126 | b.Errorf("failed to create temp dir") 127 | } 128 | defer os.RemoveAll(root) 129 | r, f := getRunner(b) 130 | defer f.Close() 131 | r.CloneFlags = unix.CLONE_NEWNS | unix.CLONE_NEWPID | unix.CLONE_NEWUSER | unix.CLONE_NEWUTS | unix.CLONE_NEWCGROUP 132 | r.PivotRoot = root 133 | r.NoNewPrivs = true 134 | r.DropCaps = true 135 | r.Mounts = getMounts(defaultBind) 136 | benchmarkRun(r, b) 137 | } 138 | 139 | // BenchmarkUnshareAll is about 800ms/op 140 | func BenchmarkUnshareAll(b *testing.B) { 141 | r, f := getRunner(b) 142 | defer f.Close() 143 | r.CloneFlags = UnshareFlags 144 | r.NoNewPrivs = true 145 | r.DropCaps = true 146 | benchmarkRun(r, b) 147 | } 148 | 149 | // BenchmarkUnshareMountPivot is about 880ms/op 150 | func BenchmarkUnshareMountPivot(b *testing.B) { 151 | root, err := os.MkdirTemp("", "ns") 152 | if err != nil { 153 | b.Errorf("failed to create temp dir") 154 | } 155 | defer os.RemoveAll(root) 156 | r, f := getRunner(b) 157 | defer f.Close() 158 | r.CloneFlags = UnshareFlags 159 | r.PivotRoot = root 160 | r.NoNewPrivs = true 161 | r.DropCaps = true 162 | r.Mounts = getMounts(defaultBind) 163 | benchmarkRun(r, b) 164 | } 165 | 166 | func getRunner(b *testing.B) (*Runner, 
*os.File) { 167 | f := openNull(b) 168 | return &Runner{ 169 | Args: []string{"/bin/echo"}, 170 | Env: []string{"PATH=/bin"}, 171 | Files: []uintptr{f.Fd(), f.Fd(), f.Fd()}, 172 | WorkDir: "/bin", 173 | }, f 174 | } 175 | 176 | func benchmarkRun(r *Runner, b *testing.B) { 177 | b.ResetTimer() 178 | b.RunParallel(func(pb *testing.PB) { 179 | for pb.Next() { 180 | pid, err := r.Start() 181 | if err != nil { 182 | b.Fatal(err) 183 | } 184 | wait4(pid, b) 185 | } 186 | }) 187 | } 188 | 189 | func getMounts(dirs []string) []mount.SyscallParams { 190 | builder := mount.NewBuilder() 191 | for _, d := range dirs { 192 | builder.WithMount(mount.Mount{ 193 | Source: d, 194 | Target: d[1:], 195 | Flags: roBind, 196 | }) 197 | } 198 | m, _ := builder.FilterNotExist().Build() 199 | return m 200 | } 201 | 202 | func openNull(b *testing.B) *os.File { 203 | f, err := os.OpenFile("/dev/null", os.O_RDWR, 0666) 204 | if err != nil { 205 | b.Errorf("Failed to open %v", err) 206 | } 207 | return f 208 | } 209 | 210 | func wait4(pid int, b *testing.B) { 211 | var wstat syscall.WaitStatus 212 | for { 213 | syscall.Wait4(pid, &wstat, 0, nil) 214 | if wstat.Exited() { 215 | if s := wstat.ExitStatus(); s != 0 { 216 | b.Errorf("Exited: %d", s) 217 | } 218 | break 219 | } 220 | } 221 | } 222 | -------------------------------------------------------------------------------- /pkg/forkexec/clone3_linux.go: -------------------------------------------------------------------------------- 1 | package forkexec 2 | 3 | // cloneArgs holds arguments for clone3 Linux syscall. 
// from src/syscall/exec_linux.go:196
type cloneArgs struct {
	flags      uint64 // Flags bit mask
	pidFD      uint64 // Where to store PID file descriptor (int *)
	childTID   uint64 // Where to store child TID, in child's memory (pid_t *)
	parentTID  uint64 // Where to store child TID, in parent's memory (pid_t *)
	exitSignal uint64 // Signal to deliver to parent on child termination
	stack      uint64 // Pointer to lowest byte of stack
	stackSize  uint64 // Size of stack
	tls        uint64 // Location of new TLS
	setTID     uint64 // Pointer to a pid_t array (since Linux 5.5)
	setTIDSize uint64 // Number of elements in set_tid (since Linux 5.5)
	cgroup     uint64 // File descriptor for target cgroup of child (since Linux 5.7)
}

--------------------------------------------------------------------------------
/pkg/forkexec/consts_linux.go:
--------------------------------------------------------------------------------
package forkexec

import (
	"golang.org/x/sys/unix"
)

// defines missing consts from syscall package
const (
	SECCOMP_SET_MODE_STRICT   = 0
	SECCOMP_SET_MODE_FILTER   = 1
	SECCOMP_FILTER_FLAG_TSYNC = 1

	// UnshareFlags combines the ipc, net, mount, pid, user, uts and cgroup
	// namespace clone flags
	UnshareFlags = unix.CLONE_NEWIPC | unix.CLONE_NEWNET | unix.CLONE_NEWNS |
		unix.CLONE_NEWPID | unix.CLONE_NEWUSER | unix.CLONE_NEWUTS | unix.CLONE_NEWCGROUP

	// Read-only bind mount need to be remounted
	bindRo = unix.MS_BIND | unix.MS_RDONLY
)

// used by unshare remount / to private
var (
	// NUL-terminated byte strings
	none  = []byte("none\000")
	slash = []byte("/\000")
	empty = []byte("\000")
	tmpfs = []byte("tmpfs\000")

	// tmp dir made by pivot_root
	oldRoot = []byte("old_root\000")

	// set groups for unshare user (no NUL terminator)
	setGIDAllow = []byte("allow")
	setGIDDeny  = []byte("deny")

	// go does not allow constant uintptr to be negative...
	_AT_FDCWD = unix.AT_FDCWD

	// Drop all capabilities
	dropCapHeader = unix.CapUserHeader{
		Version: unix.LINUX_CAPABILITY_VERSION_3,
		Pid:     0,
	}

	// empty effective, permitted and inheritable sets
	dropCapData = unix.CapUserData{
		Effective:   0,
		Permitted:   0,
		Inheritable: 0,
	}

	// 1ms
	etxtbsyRetryInterval = unix.Timespec{
		Nsec: 1 * 1000 * 1000,
	}
)

// Consecutive single-bit flags, each followed by its _LOCKED variant.
// NOTE(review): these look like the kernel securebits (linux/securebits.h) — confirm.
const (
	_SECURE_NOROOT = 1 << iota
	_SECURE_NOROOT_LOCKED

	_SECURE_NO_SETUID_FIXUP
	_SECURE_NO_SETUID_FIXUP_LOCKED

	_SECURE_KEEP_CAPS
	_SECURE_KEEP_CAPS_LOCKED

	_SECURE_NO_CAP_AMBIENT_RAISE
	_SECURE_NO_CAP_AMBIENT_RAISE_LOCKED
)

--------------------------------------------------------------------------------
/pkg/forkexec/doc.go:
--------------------------------------------------------------------------------
// Package forkexec provides interface to run a subprocess with seccomp filter, rlimit and
// containerized or ptraced.
//
// unshare cgroup namespace requires kernel >= 4.6
// seccomp, unshare pid / user namespaces requires kernel >= 3.8
// pipe2, dup3 requires kernel >= 2.6.27
package forkexec

--------------------------------------------------------------------------------
/pkg/forkexec/errloc_linux.go:
--------------------------------------------------------------------------------
package forkexec

import (
	"fmt"
	"syscall"
)

// ErrorLocation defines the location where child process failed to exec
type ErrorLocation int

// ChildError defines the specific error and location where it failed
type ChildError struct {
	Err      syscall.Errno // errno reported for the failed step
	Location ErrorLocation // step that failed
	Index    int           // printed in the error message only when > 0
}

// Location constants
const (
	LocClone ErrorLocation = iota + 1
	LocCloseWrite
	LocUnshareUserRead
	LocGetPid
	LocKeepCapability
	LocSetGroups
	LocSetGid
	LocSetUid
	LocDup3
	LocFcntl
LocSetSid 31 | LocIoctl 32 | LocMountRoot 33 | LocMountTmpfs 34 | LocMountChdir 35 | LocMount 36 | LocMountMkdir 37 | LocPivotRoot 38 | LocUmount 39 | LocUnlink 40 | LocMountRootReadonly 41 | LocChdir 42 | LocSetRlimit 43 | LocSetNoNewPrivs 44 | LocDropCapability 45 | LocSetCap 46 | LocPtraceMe 47 | LocStop 48 | LocSeccomp 49 | LocSyncWrite 50 | LocSyncRead 51 | LocExecve 52 | ) 53 | 54 | var locToString = []string{ 55 | "unknown", 56 | "clone", 57 | "close_write", 58 | "unshare_user_read", 59 | "getpid", 60 | "keep_capability", 61 | "setgroups", 62 | "setgid", 63 | "setuid", 64 | "dup3", 65 | "fcntl", 66 | "setsid", 67 | "ioctl", 68 | "mount(root)", 69 | "mount(tmpfs)", 70 | "mount(chdir)", 71 | "mount", 72 | "mount(mkdir)", 73 | "pivot_root", 74 | "umount", 75 | "unlink", 76 | "mount(readonly)", 77 | "chdir", 78 | "setrlimt", 79 | "set_no_new_privs", 80 | "drop_capability", 81 | "set_cap", 82 | "ptrace_me", 83 | "stop", 84 | "seccomp", 85 | "sync_write", 86 | "sync_read", 87 | "execve", 88 | } 89 | 90 | func (e ErrorLocation) String() string { 91 | if e >= LocClone && e <= LocExecve { 92 | return locToString[e] 93 | } 94 | return "unknown" 95 | } 96 | 97 | func (e ChildError) Error() string { 98 | if e.Index > 0 { 99 | return fmt.Sprintf("%s(%d): %s", e.Location.String(), e.Index, e.Err.Error()) 100 | } 101 | return fmt.Sprintf("%s: %s", e.Location.String(), e.Err.Error()) 102 | } 103 | -------------------------------------------------------------------------------- /pkg/forkexec/fork_child_darwin.go: -------------------------------------------------------------------------------- 1 | package forkexec 2 | 3 | import ( 4 | "syscall" 5 | "unsafe" 6 | ) 7 | 8 | // Reference to src/syscall/exec_darwin.go 9 | //go:norace 10 | func forkAndExecInChild(r *Runner, argv0 *byte, argv, env []*byte, workdir, profile *byte, p [2]int) (r1 uintptr, err1 syscall.Errno) { 11 | var ( 12 | err2 syscall.Errno 13 | errBuf *byte 14 | ) 15 | 16 | // similar to exec_linux, avoid side 
	// effect by shuffling around
	fd, nextfd := prepareFds(r.Files)
	// the child talks to the parent through p[1]
	pipe := p[1]

	// About to call fork.
	// No more allocation or calls of non-assembly functions.
	beforeFork()

	// fork the process (there is no clone / namespace support on darwin)
	r1, _, err1 = rawSyscall(libc_fork_trampoline_addr, 0, 0, 0)
	if err1 != 0 || r1 != 0 {
		// in parent process, immediate return
		return
	}

	// In child process
	afterForkInChild()
	// Notice: cannot call any GO functions beyond this point

	// Close the parent's end of the socket pair (p[0])
	if _, _, err1 = rawSyscall(libc_close_trampoline_addr, uintptr(p[0]), 0, 0); err1 != 0 {
		goto childerror
	}

	// Set pg id
	_, _, err1 = rawSyscall(libc_setpgid_trampoline_addr, 0, 0, 0)
	if err1 != 0 {
		goto childerror
	}

	// Pass 1 & pass 2 assigns fds for child process
	// Pass 1: fd[i] < i => nextfd
	// The sync socket is moved first when it sits below nextfd, so that
	// it is not overwritten by the shuffling below
	if pipe < nextfd {
		_, _, err1 = rawSyscall(libc_dup2_trampoline_addr, uintptr(pipe), uintptr(nextfd), 0)
		if err1 != 0 {
			goto childerror
		}
		// the duplicate must not survive execve
		rawSyscall(libc_fcntl_trampoline_addr, uintptr(nextfd), syscall.F_SETFD, syscall.FD_CLOEXEC)
		pipe = nextfd
		nextfd++
	}
	for i := 0; i < len(fd); i++ {
		if fd[i] >= 0 && fd[i] < int(i) {
			// Avoid fd rewrite
			if nextfd == pipe {
				nextfd++
			}
			_, _, err1 = rawSyscall(libc_dup2_trampoline_addr, uintptr(fd[i]), uintptr(nextfd), 0)
			if err1 != 0 {
				goto childerror
			}
			// Set up close on exec
			rawSyscall(libc_fcntl_trampoline_addr, uintptr(nextfd), syscall.F_SETFD, syscall.FD_CLOEXEC)
			fd[i] = nextfd
			nextfd++
		}
	}
	// Pass 2: fd[i] => i
	for i := 0; i < len(fd); i++ {
		if fd[i] == -1 {
			// -1 means the child should not have descriptor i at all
			rawSyscall(libc_close_trampoline_addr, uintptr(i), 0, 0)
			continue
		}
		if fd[i] == int(i) {
			// dup2(i, i) will not clear close on exec flag, need to reset the flag
			_, _, err1 = rawSyscall(libc_fcntl_trampoline_addr, uintptr(fd[i]), syscall.F_SETFD, 0)
			if err1 != 0 {
				goto childerror
			}
			continue
		}
		_, _, err1 = rawSyscall(libc_dup2_trampoline_addr, uintptr(fd[i]), uintptr(i), 0)
		if err1 != 0 {
			goto childerror
		}
	}

	// chdir for child
	if workdir != nil {
		_, _, err1 = rawSyscall(libc_chdir_trampoline_addr, uintptr(unsafe.Pointer(workdir)), 0, 0)
		if err1 != 0 {
			goto childerror
		}
	}

	// Set limit
	for _, rlim := range r.RLimits {
		_, _, err1 = rawSyscall(libc_setrlimit_trampoline_addr, uintptr(rlim.Res), uintptr(unsafe.Pointer(&rlim.Rlim)), 0)
		if err1 != 0 {
			// EINVAL on the data / address space limits is tolerated
			if err1 == syscall.EINVAL && (rlim.Res == syscall.RLIMIT_DATA || rlim.Res == syscall.RLIMIT_AS) {
				continue
			}
			goto childerror
		}
	}

	// Load sandbox profile
	if profile != nil {
		r1, _, err1 = rawSyscall(libc_sandbox_init_trampoline_addr, uintptr(unsafe.Pointer(profile)), 0, uintptr(unsafe.Pointer(&errBuf)))
		if err1 != 0 {
			goto childerror
		}
		if r1 != 0 {
			// sandbox_init reported failure without errno: use a fixed code
			err1 = 253
			goto childerror
		}
		rawSyscall(libc_sandbox_free_error_trampoline_addr, uintptr(unsafe.Pointer(errBuf)), 0, 0)
	}

	// Sync before exec: send a zero error code, then wait for the ack of the parent
	err2 = 0
	r1, _, err1 = rawSyscall(libc_write_trampoline_addr, uintptr(pipe), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2))
	if r1 == 0 || err1 != 0 {
		goto childerror
	}

	r1, _, err1 = rawSyscall(libc_read_trampoline_addr, uintptr(pipe), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2))
	if r1 == 0 || err1 != 0 {
		goto childerror
	}

	// Time to exec.
	_, _, err1 = rawSyscall(libc_execve_trampoline_addr,
		uintptr(unsafe.Pointer(argv0)),
		uintptr(unsafe.Pointer(&argv[0])),
		uintptr(unsafe.Pointer(&env[0])))

childerror:
	// send error code on pipe
	rawSyscall(libc_write_trampoline_addr, uintptr(pipe), uintptr(unsafe.Pointer(&err1)), unsafe.Sizeof(err1))
	// exit is retried in a loop so that control never falls out of this function
	for {
		rawSyscall(libc_exit_trampoline_addr, uintptr(err1+err2), 0, 0)
	}
}

--------------------------------------------------------------------------------
/pkg/forkexec/fork_darwin.go:
--------------------------------------------------------------------------------
package forkexec

import (
	"syscall"
	"unsafe"

	"golang.org/x/sys/unix"
)

// Start forks the process; the child sets up its file descriptors, working
// directory and resource limits, loads the sandbox profile if one is given
// and finally calls execve.
// Return pid and potential error
func (r *Runner) Start() (int, error) {
	argv0, argv, env, err := prepareExec(r.Args, r.Env)
	if err != nil {
		return 0, err
	}

	// prepare work dir
	workdir, err := syscallStringFromString(r.WorkDir)
	if err != nil {
		return 0, err
	}

	// prepare sandbox profile
	profile, err := syscallStringFromString(r.SandboxProfile)
	if err != nil {
		return 0, err
	}

	// ensure the socketpair created did not leak to child
	syscall.ForkLock.Lock()

	// socketpair p is also used to sync with parent before final execve
	// p[0] is used by parent and p[1] is used by child
	var p [2]int
	if err := forkExecSocketPair(&p); err != nil {
		syscall.ForkLock.Unlock()
		return 0, err
	}

	// fork in child
	pid, err1 := forkAndExecInChild(r, argv0, argv, env, workdir, profile, p)

	// restore all signals
	afterFork()

	syscall.ForkLock.Unlock()

	return syncWithChild(r, p, int(pid),
err1) 52 | } 53 | 54 | func forkExecSocketPair(p *[2]int) error { 55 | var err error 56 | *p, err = syscall.Socketpair(syscall.AF_LOCAL, syscall.SOCK_STREAM, 0) 57 | if err != nil { 58 | return err 59 | } 60 | _, err = fcntl(p[0], syscall.F_SETFD, syscall.FD_CLOEXEC) 61 | if err != nil { 62 | return err 63 | } 64 | _, err = fcntl(p[1], syscall.F_SETFD, syscall.FD_CLOEXEC) 65 | if err != nil { 66 | return err 67 | } 68 | return nil 69 | } 70 | 71 | func syncWithChild(r *Runner, p [2]int, pid int, err1 syscall.Errno) (int, error) { 72 | var ( 73 | r1 uintptr 74 | err2 syscall.Errno 75 | err error 76 | ) 77 | 78 | // sync with child 79 | unix.Close(p[1]) 80 | 81 | // clone syscall failed 82 | if err1 != 0 { 83 | unix.Close(p[0]) 84 | return 0, syscall.Errno(err1) 85 | } 86 | r1, _, err1 = syscall3(libc_read_trampoline_addr, uintptr(p[0]), uintptr(unsafe.Pointer(&err2)), uintptr(unsafe.Sizeof(err2))) 87 | // child returned error code 88 | if r1 != unsafe.Sizeof(err2) || err2 != 0 || err1 != 0 { 89 | err = handlePipeError(r1, err2) 90 | goto fail 91 | } 92 | 93 | // if syncfunc return error, then fail child immediately 94 | if r.SyncFunc != nil { 95 | if err = r.SyncFunc(int(pid)); err != nil { 96 | goto fail 97 | } 98 | } 99 | // otherwise, ack child (err1 == 0) 100 | r1, _, err1 = syscall3(libc_write_trampoline_addr, uintptr(p[0]), uintptr(unsafe.Pointer(&err1)), uintptr(unsafe.Sizeof(err1))) 101 | if err1 != 0 { 102 | goto fail 103 | } 104 | 105 | // if read anything mean child failed after sync (close_on_exec so it should not block) 106 | r1, _, err1 = syscall3(libc_read_trampoline_addr, uintptr(p[0]), uintptr(unsafe.Pointer(&err2)), uintptr(unsafe.Sizeof(err2))) 107 | unix.Close(p[0]) 108 | if r1 != 0 || err1 != 0 { 109 | err = handlePipeError(r1, err2) 110 | goto failAfterClose 111 | } 112 | return int(pid), nil 113 | 114 | fail: 115 | unix.Close(p[0]) 116 | 117 | failAfterClose: 118 | handleChildFailed(int(pid)) 119 | return 0, err 120 | } 121 | 122 | // check 
// handlePipeError converts the outcome of reading a status word from the sync
// socket into an error: when exactly one errno-sized word was read, the word
// itself is returned; any other length is reported as EPIPE.
func handlePipeError(r1 uintptr, errno syscall.Errno) error {
	if r1 != unsafe.Sizeof(errno) {
		return syscall.EPIPE
	}
	return errno
}

// handleChildFailed kills the child with SIGKILL so it cannot stay blocked,
// then waits for it, retrying on EINTR, so that no zombie is left behind.
func handleChildFailed(pid int) {
	var status syscall.WaitStatus

	syscall.Kill(pid, syscall.SIGKILL)
	for {
		if _, waitErr := syscall.Wait4(pid, &status, 0, nil); waitErr != syscall.EINTR {
			return
		}
	}
}
// syncWithChild runs in the parent after the clone attempt and drives the
// handshake with the child over the socket pair p (p[0] is the parent end,
// p[1] the child end). err1 is the errno reported by the clone call.
//
// On success it returns the child pid. On failure it returns 0 together with
// either a ChildError or, when no child error code is available, the error
// from SyncFunc or from reading the socket. Except when the clone itself
// failed, the child is killed and reaped through handleChildFailed.
func syncWithChild(r *Runner, p [2]int, pid int, err1 syscall.Errno) (int, error) {
	var (
		err2        syscall.Errno
		err         error
		unshareUser = r.CloneFlags&unix.CLONE_NEWUSER == unix.CLONE_NEWUSER
		childErr    ChildError
		n           int
	)

	// sync with child
	unix.Close(p[1])

	// clone syscall failed
	if err1 != 0 {
		unix.Close(p[0])
		childErr.Location = LocClone
		childErr.Err = err1
		return 0, childErr
	}

	// synchronize with child for uid / gid map
	if unshareUser {
		if err = writeIDMaps(r, int(pid)); err != nil {
			// NOTE(review): unchecked type assertion; this panics if
			// writeIDMaps ever returns an error that is not a syscall.Errno.
			err2 = err.(syscall.Errno)
		}
		// Send the result (0 on success) to the child; the write result is ignored.
		syscall.RawSyscall(syscall.SYS_WRITE, uintptr(p[0]), uintptr(unsafe.Pointer(&err2)), uintptr(unsafe.Sizeof(err2)))
	}

	// if syncfunc return error, then fail child immediately
	// only sync if there is a syncFunc
	if r.SyncFunc != nil {
		n, err = readChildErr(p[0], &childErr)
		// child returned error code
		// (the read must be exactly an errno-sized or a ChildError-sized
		// message carrying a zero error to count as "ready")
		if (n != int(unsafe.Sizeof(err2)) && n != int(unsafe.Sizeof(childErr))) || childErr.Err != 0 || err != nil {
			childErr.Err = handlePipeError(n, childErr.Err)
			goto fail
		}
		if err = r.SyncFunc(int(pid)); err != nil {
			goto fail
		}
		// otherwise, ack child (err1 == 0)
		syscall.RawSyscall(syscall.SYS_WRITE, uintptr(p[0]), uintptr(unsafe.Pointer(&err1)), uintptr(unsafe.Sizeof(err1)))
	}

	// if stopped before execve by signal SIGSTOP or PTRACE_ME, then do not wait until execve
	if r.StopBeforeSeccomp || (r.Seccomp != nil && r.Ptrace) {
		// let's wait it in another goroutine to avoid SIGPIPE
		// (the goroutine owns p[0] from here on and closes it when the read returns)
		go func() {
			readChildErr(p[0], &childErr)
			unix.Close(p[0])
		}()
		return int(pid), nil
	}

	// if read anything mean child failed after sync (close_on_exec so it should not block)
	n, err = readChildErr(p[0], &childErr)
	unix.Close(p[0])
	if n != 0 || err != nil {
		childErr.Err = handlePipeError(n, childErr.Err)
		goto failAfterClose
	}
	return int(pid), nil

fail:
	unix.Close(p[0])

failAfterClose:
	handleChildFailed(int(pid))
	// Prefer the structured child error; fall back to the plain error when
	// no child error code was recorded.
	if childErr.Err == 0 {
		return 0, err
	}
	return 0, childErr
}

// readChildErr reads up to one ChildError worth of bytes from fd directly into
// *childErr, retrying while the read is interrupted (EINTR). It returns the
// byte count and error of the last read attempt.
func readChildErr(fd int, childErr *ChildError) (n int, err error) {
	for {
		n, err = readlen(fd, (*byte)(unsafe.Pointer(childErr)), int(unsafe.Sizeof(*childErr)))
		if err != syscall.EINTR {
			break
		}
	}
	return
}

// readlen issues a read(2) of at most np bytes from fd into the memory at p.
// n is the raw syscall result converted to int; err is non-nil when the
// syscall reported an errno.
//
// https://cs.opensource.google/go/go/+/refs/tags/go1.18.1:src/syscall/zsyscall_linux_amd64.go;l=944
func readlen(fd int, p *byte, np int) (n int, err error) {
	r0, _, e1 := syscall.Syscall(syscall.SYS_READ, uintptr(fd), uintptr(unsafe.Pointer(p)), uintptr(np))
	n = int(r0)
	if e1 != 0 {
		err = syscall.Errno(e1)
	}
	return
}

// handlePipeError maps the result of a status read to an errno: when at least
// an errno-sized amount was read the given errno is returned unchanged,
// otherwise EPIPE.
//
// NOTE(review): r1 is converted to uintptr before the comparison, so a
// negative r1 also takes the first branch and returns errno as is.
func handlePipeError(r1 int, errno syscall.Errno) syscall.Errno {
	if uintptr(r1) >= unsafe.Sizeof(errno) {
		return syscall.Errno(errno)
	}
	return syscall.EPIPE
}
// TestFork_DropCaps starts /bin/echo in a new user namespace with DropCaps
// enabled and only checks that Start reports no error.
func TestFork_DropCaps(t *testing.T) {
	t.Parallel()
	r := Runner{
		Args:       []string{"/bin/echo"},
		CloneFlags: syscall.CLONE_NEWUSER,
		DropCaps:   true,
	}
	_, err := r.Start()
	if err != nil {
		t.Fatal(err)
	}
}

// TestFork_ETXTBSY copies /bin/echo into a temporary file and starts it
// through ExecFile while the file is still open via f, expecting Start to
// return a ChildError.
func TestFork_ETXTBSY(t *testing.T) {
	f, err := os.CreateTemp("", "")
	if err != nil {
		t.Fatal(err)
	}
	t.Cleanup(func() {
		os.Remove(f.Name())
		f.Close()
	})

	if err := f.Chmod(0777); err != nil {
		t.Fatal(err)
	}

	echo, err := os.Open("/bin/echo")
	if err != nil {
		t.Fatal(err)
	}
	defer echo.Close()

	_, err = f.ReadFrom(echo)
	if err != nil {
		t.Fatal(err)
	}

	// f is intentionally left open here; its descriptor is handed to the runner.
	r := Runner{
		Args:     []string{f.Name()},
		ExecFile: f.Fd(),
	}
	_, err = r.Start()
	e, ok := err.(ChildError)
	if !ok {
		t.Fatalf("not a child error")
	}
	// NOTE(review): with && the test fails only when all three fields differ
	// from the expected values; confirm whether || was intended.
	if e.Err != syscall.ETXTBSY && e.Location != LocExecve && e.Index != 0 {
		t.Fatal(err)
	}
}
// prepareExec converts Args and Env into the NUL-terminated C strings and
// nil-terminated pointer arrays expected by execve.
//
// It returns the pointer to the program path (Args[0]), the argv array and the
// envp array. An empty Args yields syscall.EINVAL instead of panicking on
// Args[0]; a string containing a NUL byte yields the error reported by the
// syscall package.
func prepareExec(Args, Env []string) (*byte, []*byte, []*byte, error) {
	if len(Args) == 0 {
		return nil, nil, nil, syscall.EINVAL
	}
	// make exec args0
	argv0, err := syscall.BytePtrFromString(Args[0])
	if err != nil {
		return nil, nil, nil, err
	}
	// make exec args
	argv, err := syscall.SlicePtrFromStrings(Args)
	if err != nil {
		return nil, nil, nil, err
	}
	// make env
	env, err := syscall.SlicePtrFromStrings(Env)
	if err != nil {
		return nil, nil, nil, err
	}
	return argv0, argv, env, nil
}

// prepareFds copies files into an int slice (index i is the descriptor wanted
// at fd i in the child) and returns it together with nextfd, a descriptor
// number that is larger than both len(files) and every entry of files and can
// therefore be used as scratch space while shuffling descriptors.
func prepareFds(files []uintptr) ([]int, int) {
	fd := make([]int, len(files))
	nextfd := len(files)
	for i, ufd := range files {
		if nextfd < int(ufd) {
			nextfd = int(ufd)
		}
		fd[i] = int(ufd)
	}
	nextfd++
	return fd, nextfd
}

// syscallStringFromString returns a NUL-terminated copy of str suitable for
// passing to a syscall, or nil (with a nil error) when str is empty.
func syscallStringFromString(str string) (*byte, error) {
	if str == "" {
		return nil, nil
	}
	return syscall.BytePtrFromString(str)
}
// Runner is the configuration including the exec path, argv
// and resource limits. It can create a tracee for a ptrace-based tracer.
// It can also create an unshared process in another namespace.
type Runner struct {
	// Args and Env are the argv and env for the execve syscall of the child process.
	Args []string
	Env  []string

	// ExecFile: if exec_fd is defined, then at the end, fd_execve is called.
	ExecFile uintptr

	// RLimits are the POSIX resource limits set by setrlimit.
	RLimits []rlimit.RLimit

	// Files is the file descriptor map for the new process, from 0 to len - 1.
	Files []uintptr

	// WorkDir is the work path set by chdir(dir) (current working directory for child).
	// If pivot_root is defined, this is executed after changing to the new root.
	WorkDir string

	// Seccomp is the seccomp syscall filter applied to the child.
	Seccomp *syscall.SockFprog

	// CloneFlags are the clone unshare flags used to create linux namespaces,
	// effective when cloning the child,
	// since the unshare syscall does not join the new pid group.
	CloneFlags uintptr

	// Mounts defines the mount syscalls after unsharing the mount namespace.
	// Needs CAP_SYS_ADMIN inside the namespace (e.g. unshare user namespace).
	// If pivot root is provided, a relative target is better for chdir-mount meta
	// and pivot root will be mounted as tmpfs before any mount.
	Mounts []mount.SyscallParams

	// PivotRoot defines a readonly new root after unsharing the mount namespace.
	// It should be a directory given as an absolute path and should be used with mounts.
	// Call path:
	//  mount("tmpfs", root, "tmpfs", 0, nil)
	//  chdir(root)
	//  [do mounts]
	//  mkdir("old_root")
	//  pivot_root(root, "old_root")
	//  umount("old_root", MNT_DETACH)
	//  rmdir("old_root")
	//  mount("tmpfs", "/", "tmpfs", MS_BIND | MS_REMOUNT | MS_RDONLY | MS_NOATIME | MS_NOSUID, nil)
	PivotRoot string

	// HostName and DomainName are set after unsharing UTS & user (CAP_SYS_ADMIN).
	HostName, DomainName string

	// UIDMappings / GIDMappings for unshared user namespaces.
	// When a mapping is nil, writeIDMaps writes a default mapping of ID 0 in
	// the namespace to the caller's effective uid / gid with size 1.
	UIDMappings []syscall.SysProcIDMap
	GIDMappings []syscall.SysProcIDMap

	// CgroupFd is used when clone3 with CLONE_INTO_CGROUP with kernel >=5.7 and cgroup v2.
	CgroupFd uintptr

	// Credential holds user and group identities to be assumed
	// by a child process started by StartProcess.
	Credential *syscall.Credential

	// Parent and child process sync status through a socket pair.
	// SyncFunc is invoked with the child pid. If SyncFunc returns an error,
	// the parent signals the child to stop and reports the error.
	// SyncFunc is called right before execve, thus it can track cpu more accurately.
	SyncFunc func(int) error

	// Ptrace controls whether the child process calls ptrace(PTRACE_TRACEME).
	// runtime.LockOSThread is required for the tracer to call ptrace syscalls.
	Ptrace bool

	// NoNewPrivs calls prctl(PR_SET_NO_NEW_PRIVS) to 0 to disable calls to
	// setuid processes. It is automatically enabled when a seccomp filter is provided.
	NoNewPrivs bool

	// StopBeforeSeccomp calls kill(getpid(), SIGSTOP) to wait for the tracer to continue
	// right before the calls to seccomp. It is automatically enabled when a seccomp
	// filter and ptrace are provided since kill might not be available after
	// seccomp and execve might be traced by ptrace.
	// It cannot stop after seccomp since kill might not be allowed by the seccomp filter.
	StopBeforeSeccomp bool

	// GIDMappingsEnableSetgroups allows / disallows the setgroups syscall.
	// It is denied if GIDMappings is nil.
	GIDMappingsEnableSetgroups bool

	// DropCaps calls cap_set(self, 0) to drop all capabilities
	// from the effective, permitted and inheritable capability sets before execve.
	// It should avoid calls to set ambient capabilities.
	DropCaps bool

	// UnshareCgroupAfterSync specifies whether to unshare the cgroup namespace after
	// sync (the syncFunc might add the child to the cgroup).
	UnshareCgroupAfterSync bool

	// CTTY specifies whether to set fd 0 as the controlling TTY.
	CTTY bool
}
// goString copies the NUL-terminated C string starting at b into a Go string.
// The terminating NUL is not included. A nil pointer yields the empty string.
//
// The bytes are copied, so the result stays valid after the C buffer is freed.
func goString(b *byte) string {
	if b == nil {
		return ""
	}
	// at reads the byte i positions after b. The pointer arithmetic is done in
	// a single expression, as required by the unsafe.Pointer conversion rules.
	at := func(i int) byte {
		return *(*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(b)) + uintptr(i)))
	}

	n := 0
	for at(n) != 0 {
		n++
	}
	buf := make([]byte, n)
	for i := range buf {
		buf[i] = at(i)
	}
	return string(buf)
}
26 | 27 | var libc_dup2_trampoline_addr uintptr 28 | 29 | var libc_chdir_trampoline_addr uintptr 30 | 31 | var libc_setrlimit_trampoline_addr uintptr 32 | 33 | var libc_execve_trampoline_addr uintptr 34 | 35 | var libc_exit_trampoline_addr uintptr 36 | 37 | var libc_setpgid_trampoline_addr uintptr 38 | 39 | //go:linkname fcntl syscall.fcntl 40 | func fcntl(fd int, cmd int, arg int) (val int, err error) 41 | -------------------------------------------------------------------------------- /pkg/forkexec/test.sb: -------------------------------------------------------------------------------- 1 | ; Test Sandbox Profile 2 | ; No network / socket 3 | ; No system / sysctl 4 | (version 1) 5 | 6 | (deny default) 7 | 8 | ; allow posix ipc 9 | (allow ipc-posix*) 10 | 11 | ; allow file access / 12 | (allow file-read* (subpath "/usr/lib")) 13 | 14 | ; allow execve 15 | (allow process-exec) 16 | 17 | ; allow fork 18 | (allow process-fork) 19 | 20 | ; allow signal to self 21 | (allow signal (target self)) 22 | -------------------------------------------------------------------------------- /pkg/forkexec/userns_linux.go: -------------------------------------------------------------------------------- 1 | package forkexec 2 | 3 | import ( 4 | "strconv" 5 | "syscall" 6 | 7 | "golang.org/x/sys/unix" 8 | ) 9 | 10 | // writeUidGidMappings writes User ID and Group ID mappings for user namespaces 11 | // for a process and it is called from the parent process. 
// formatIDMappings renders idMap in the text form written to
// /proc/<pid>/uid_map and gid_map: one "ContainerID HostID Size" line per
// entry, each terminated by a newline. An empty idMap yields a nil slice.
//
// The numbers are appended directly to the output buffer, avoiding a temporary
// string and a []byte conversion per entry.
func formatIDMappings(idMap []syscall.SysProcIDMap) []byte {
	var data []byte
	for _, im := range idMap {
		data = strconv.AppendInt(data, int64(im.ContainerID), 10)
		data = append(data, ' ')
		data = strconv.AppendInt(data, int64(im.HostID), 10)
		data = append(data, ' ')
		data = strconv.AppendInt(data, int64(im.Size), 10)
		data = append(data, '\n')
	}
	return data
}
#include "textflag.h"

// See ../runtime/sys_linux_386.s for the reason why we always use int 0x80
// instead of the glibc-specific "CALL 0x10(GS)".
#define INVOKE_SYSCALL	INT	$0x80

// func RawVforkSyscall(trap, a1, a2, a3 uintptr) (r1, err uintptr)
//
// Issues syscall number trap with arguments a1..a3 (linux/386).
// On success r1 holds the syscall result and err is 0; when the result lies
// above 0xfffff001 (unsigned), r1 is -1 and err is the negated result.
TEXT ·RawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-24
	MOVL	trap+0(FP), AX	// syscall entry
	MOVL	a1+4(FP), BX
	MOVL	a2+8(FP), CX
	MOVL	a3+12(FP), DX
	// The return address is kept in SI across the syscall and pushed back
	// afterwards (presumably because a vfork child runs on the same stack
	// and could overwrite it; confirm against the Go syscall package).
	POPL	SI		// preserve return address
	INVOKE_SYSCALL
	PUSHL	SI
	CMPL	AX, $0xfffff001
	JLS	ok
	// error path: r1 = -1, err = -AX
	MOVL	$-1, r1+16(FP)
	NEGL	AX
	MOVL	AX, err+20(FP)
	RET
ok:
	MOVL	AX, r1+16(FP)
	MOVL	$0, err+20(FP)
	RET
// ---- pkg/forkexec/vfork/asm_linux_amd64.s (continued) ----

#include "textflag.h"

// func RawVforkSyscall(trap, a1, a2, a3 uintptr) (r1, err uintptr)
//
// Issues syscall number trap with arguments a1..a3 (linux/amd64); the three
// remaining argument registers are zeroed.
// On success r1 holds the syscall result and err is 0; when the result lies
// above 0xfffffffffffff001 (unsigned), r1 is -1 and err is the negated result.
TEXT ·RawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-48
	MOVQ	a1+8(FP), DI
	MOVQ	a2+16(FP), SI
	MOVQ	a3+24(FP), DX
	MOVQ	$0, R10
	MOVQ	$0, R8
	MOVQ	$0, R9
	MOVQ	trap+0(FP), AX	// syscall entry
	// The return address is kept in R12 across the syscall and pushed back
	// afterwards (presumably because a vfork child runs on the same stack
	// and could overwrite it; confirm against the Go syscall package).
	POPQ	R12 // preserve return address
	SYSCALL
	PUSHQ	R12
	CMPQ	AX, $0xfffffffffffff001
	JLS	ok2
	// error path: r1 = -1, err = -AX
	MOVQ	$-1, r1+32(FP)
	NEGQ	AX
	MOVQ	AX, err+40(FP)
	RET
ok2:
	MOVQ	AX, r1+32(FP)
	MOVQ	$0, err+40(FP)
	RET

// ---- pkg/forkexec/vfork/asm_linux_arm.s ----

// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// func RawVforkSyscall(trap, a1, a2, a3 uintptr) (r1, err uintptr)
//
// Issues syscall number trap with arguments a1..a3 (linux/arm).
// On success r1 holds the syscall result and err is 0; when the result lies
// above 0xfffff001 (unsigned), r1 is -1 and err is the negated result.
TEXT ·RawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-24
	MOVW	trap+0(FP), R7	// syscall entry
	MOVW	a1+4(FP), R0
	MOVW	a2+8(FP), R1
	MOVW	a3+12(FP), R2
	SWI	$0
	MOVW	$0xfffff001, R1
	CMP	R1, R0
	BLS	ok
	// error path: r1 = -1, err = 0 - R0
	MOVW	$-1, R1
	MOVW	R1, r1+16(FP)
	RSB	$0, R0, R0
	MOVW	R0, err+20(FP)
	RET
ok:
	MOVW	R0, r1+16(FP)
	MOVW	$0, R0
	MOVW	R0, err+20(FP)
	RET
// ---- pkg/forkexec/vfork/asm_linux_arm64.s (continued) ----

#include "textflag.h"

// func RawVforkSyscall(trap, a1, a2, a3 uintptr) (r1, err uintptr)
//
// Issues syscall number trap with arguments a1..a3 (linux/arm64); the three
// remaining argument registers are zeroed.
// On the ok path r1 holds the syscall result and err is 0; otherwise r1 is -1
// and err is the negated result.
TEXT ·RawVforkSyscall(SB),NOSPLIT,$0-48
	MOVD	a1+8(FP), R0
	MOVD	a2+16(FP), R1
	MOVD	a3+24(FP), R2
	MOVD	$0, R3
	MOVD	$0, R4
	MOVD	$0, R5
	MOVD	trap+0(FP), R8	// syscall entry
	SVC
	// R0 + 4095 without carry takes the ok path (presumably: result is not
	// in the -4095..-1 errno range; confirm).
	CMN	$4095, R0
	BCC	ok
	MOVD	$-1, R4
	MOVD	R4, r1+32(FP)	// r1
	NEG	R0, R0
	MOVD	R0, err+40(FP)	// errno
	RET
ok:
	MOVD	R0, r1+32(FP)	// r1
	MOVD	ZR, err+40(FP)	// errno
	RET

// ---- pkg/forkexec/vfork/asm_linux_loong64.s ----

// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// func RawVforkSyscall(trap, a1, a2, a3 uintptr) (r1, err uintptr)
//
// Issues syscall number trap with arguments a1..a3 (linux/loong64); the three
// remaining argument registers are zeroed.
// On the ok path r1 holds the syscall result and err is 0; otherwise r1 is -1
// and err is 0 minus the result.
TEXT ·RawVforkSyscall(SB),NOSPLIT,$0-48
	MOVV	a1+8(FP), R4
	MOVV	a2+16(FP), R5
	MOVV	a3+24(FP), R6
	MOVV	$0, R7
	MOVV	$0, R8
	MOVV	$0, R9
	MOVV	trap+0(FP), R11	// syscall entry
	SYSCALL
	// unsigned comparison of the result in R4 against -4096 selects the path
	MOVW	$-4096, R12
	BGEU	R12, R4, ok
	MOVV	$-1, R12
	MOVV	R12, r1+32(FP)	// r1
	SUBVU	R4, R0, R4
	MOVV	R4, err+40(FP)	// errno
	RET
ok:
	MOVV	R4, r1+32(FP)	// r1
	MOVV	R0, err+40(FP)	// errno
	RET
// ---- pkg/forkexec/vfork/asm_linux_mips64x.s (continued) ----

//go:build linux && (mips64 || mips64le)

#include "textflag.h"

// func RawVforkSyscall(trap, a1, a2, a3 uintptr) (r1, err uintptr)
//
// Issues syscall number trap with arguments a1..a3 (linux/mips64, mips64le);
// R7, R8 and R9 are zeroed before the call.
// After the call, R7 == 0 selects the ok path (r1 = R2, err = 0); otherwise
// r1 is -1 and err is R2 as left by the syscall.
TEXT ·RawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-48
	MOVV	a1+8(FP), R4
	MOVV	a2+16(FP), R5
	MOVV	a3+24(FP), R6
	MOVV	R0, R7
	MOVV	R0, R8
	MOVV	R0, R9
	MOVV	trap+0(FP), R2	// syscall entry
	SYSCALL
	BEQ	R7, ok
	MOVV	$-1, R1
	MOVV	R1, r1+32(FP)	// r1
	MOVV	R2, err+40(FP)	// errno
	RET
ok:
	MOVV	R2, r1+32(FP)	// r1
	MOVV	R0, err+40(FP)	// errno
	RET

// ---- pkg/forkexec/vfork/asm_linux_mipsx.s ----

// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build linux && (mips || mipsle)

#include "textflag.h"

// func RawVforkSyscall(trap, a1, a2, a3 uintptr) (r1, err uintptr)
//
// Issues syscall number trap with arguments a1..a3 (linux/mips, mipsle).
// After the call, R7 == 0 selects the ok path (r1 = R2, err = 0); otherwise
// r1 is -1 and err is R2 as left by the syscall.
TEXT ·RawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-24
	MOVW	a1+4(FP), R4
	MOVW	a2+8(FP), R5
	MOVW	a3+12(FP), R6
	MOVW	trap+0(FP), R2	// syscall entry
	SYSCALL
	BEQ	R7, ok
	MOVW	$-1, R1
	MOVW	R1, r1+16(FP)	// r1
	MOVW	R2, err+20(FP)	// errno
	RET
ok:
	MOVW	R2, r1+16(FP)	// r1
	MOVW	R0, err+20(FP)	// errno
	RET
// func RawVforkSyscall(trap, a1, a2, a3 uintptr) (r1, err uintptr)
//
// Issues the raw system call numbered trap with three arguments.
// A result that is greater than -4096 when compared as unsigned (that is,
// in [-4095, -1]) is treated as a failure: r1 becomes -1 and err the negated
// result. Any other result is returned in r1 with err = 0.
TEXT ·RawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-48
	// a1..a3 go into A0..A2; A3..A5 are cleared.
	MOV	a1+8(FP), A0
	MOV	a2+16(FP), A1
	MOV	a3+24(FP), A2
	MOV	ZERO, A3
	MOV	ZERO, A4
	MOV	ZERO, A5
	MOV	trap+0(FP), A7	// syscall entry
	ECALL
	// Unsigned compare: branch to err when -4096 < A0.
	MOV	$-4096, T0
	BLTU	T0, A0, err
	MOV	A0, r1+32(FP)	// r1
	MOV	ZERO, err+40(FP)	// errno
	RET
err:
	MOV	$-1, T0
	MOV	T0, r1+32(FP)	// r1
	// A0 = 0 - A0, turning the negative result into a positive error number.
	SUB	A0, ZERO, A0
	MOV	A0, err+40(FP)	// errno
	RET
4 | 5 | #include "textflag.h" 6 | 7 | // func RawVforkSyscall(trap, a1, a2, a3 uintptr) (r1, err uintptr) 8 | TEXT ·RawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-48 9 | MOVD a1+8(FP), R2 10 | MOVD a2+16(FP), R3 11 | MOVD a3+24(FP), R4 12 | MOVD $0, R5 13 | MOVD $0, R6 14 | MOVD $0, R7 15 | MOVD trap+0(FP), R1 // syscall entry 16 | SYSCALL 17 | MOVD $0xfffffffffffff001, R8 18 | CMPUBLT R2, R8, ok2 19 | MOVD $-1, r1+32(FP) 20 | NEG R2, R2 21 | MOVD R2, err+40(FP) // errno 22 | RET 23 | ok2: 24 | MOVD R2, r1+32(FP) 25 | MOVD $0, err+40(FP) // errno 26 | RET 27 | -------------------------------------------------------------------------------- /pkg/forkexec/vfork/syscall.go: -------------------------------------------------------------------------------- 1 | // Package vfork provides the mirror of the un-exported syscall.rawVforkSyscall. 2 | // The assembly code is copied from go1.24 syscall package 3 | package vfork 4 | 5 | import "syscall" 6 | 7 | // RawVforkSyscall provided the mirrored version from un-exported syscall.rawVforkSyscall 8 | // The go:linkname does not work for assembly function and it was suggested by the go team 9 | // to copy over the assembly functions 10 | // 11 | // See go.dev/issue/71892 12 | func RawVforkSyscall(trap, a1, a2, a3 uintptr) (r1 uintptr, err syscall.Errno) 13 | -------------------------------------------------------------------------------- /pkg/forkexec/zsyscall_darwin.go: -------------------------------------------------------------------------------- 1 | package forkexec 2 | 3 | import ( 4 | "syscall" 5 | "unsafe" 6 | ) 7 | 8 | // SandboxInit calls sandbox_init 9 | func SandboxInit(profile *byte, flags uint64, errorBuf **byte) (err error) { 10 | var r1 uintptr 11 | r1, _, err = syscall3(libc_sandbox_init_trampoline_addr, uintptr(unsafe.Pointer(profile)), uintptr(flags), uintptr(unsafe.Pointer(errorBuf))) 12 | if r1 != 0 { 13 | err = syscall.EINVAL 14 | } else { 15 | err = nil 16 | } 17 | return 18 | } 19 | 20 | // SandboxFreeError calls 
func SandboxFreeError(errorBuf *byte) {
	// errorBuf is passed as the only meaningful argument (the other two are 0).
	// All results of syscall3 are discarded, so a failure of the underlying
	// call is not reported to the caller.
	syscall3(libc_sandbox_free_error_trampoline_addr, uintptr(unsafe.Pointer(errorBuf)), 0, 0)
}
// libc_fcntl_trampoline is a file-local (<>) zero-frame stub whose only
// instruction jumps to the libc_fcntl symbol.
TEXT libc_fcntl_trampoline<>(SB),NOSPLIT,$0-0
	JMP	libc_fcntl(SB)

// ·libc_fcntl_trampoline_addr is an 8-byte read-only word initialised with
// the address of the trampoline above, so Go code can refer to it as a uintptr.
GLOBL	·libc_fcntl_trampoline_addr(SB), RODATA, $8
DATA	·libc_fcntl_trampoline_addr(SB)/8, $libc_fcntl_trampoline<>(SB)
2 | // Requires kernel >= 3.17 3 | package memfd 4 | -------------------------------------------------------------------------------- /pkg/memfd/memfd_linux.go: -------------------------------------------------------------------------------- 1 | package memfd 2 | 3 | import ( 4 | "fmt" 5 | "io" 6 | "os" 7 | 8 | "golang.org/x/sys/unix" 9 | ) 10 | 11 | const createFlag = unix.MFD_CLOEXEC | unix.MFD_ALLOW_SEALING 12 | const roSeal = unix.F_SEAL_SEAL | unix.F_SEAL_SHRINK | unix.F_SEAL_GROW | unix.F_SEAL_WRITE 13 | 14 | // New creates a new memfd, caller need to close the file 15 | func New(name string) (*os.File, error) { 16 | fd, err := unix.MemfdCreate(name, createFlag) 17 | if err != nil { 18 | return nil, fmt.Errorf("memfd: memfd_create: %w", err) 19 | } 20 | file := os.NewFile(uintptr(fd), name) 21 | if file == nil { 22 | unix.Close(fd) 23 | return nil, fmt.Errorf("memfd: new file failed for %q", name) 24 | } 25 | return file, nil 26 | } 27 | 28 | // DupToMemfd reads content from reader to sealed (readonly) memfd for given name 29 | func DupToMemfd(name string, reader io.Reader) (*os.File, error) { 30 | file, err := New(name) 31 | if err != nil { 32 | return nil, fmt.Errorf("memfd: dup: %w", err) 33 | } 34 | // linux syscall sendfile might be more efficient here if reader is a file 35 | if _, err = file.ReadFrom(reader); err != nil { 36 | file.Close() 37 | return nil, fmt.Errorf("memfd: read from: %w", err) 38 | } 39 | // make memfd readonly 40 | if _, err = unix.FcntlInt(file.Fd(), unix.F_ADD_SEALS, roSeal); err != nil { 41 | file.Close() 42 | return nil, fmt.Errorf("memfd: seal: %w", err) 43 | } 44 | if _, err := file.Seek(0, 0); err != nil { 45 | file.Close() 46 | return nil, fmt.Errorf("memfd: seek: %w", err) 47 | } 48 | return file, nil 49 | } 50 | -------------------------------------------------------------------------------- /pkg/memfd/memfd_linux_test.go: -------------------------------------------------------------------------------- 1 | package memfd 2 
// TestDupToMemfd checks the two observable properties of the file returned by
// DupToMemfd: a write to it must fail (it is sealed read-only), and reading
// from offset 0 must yield exactly the bytes supplied by the reader.
func TestDupToMemfd(t *testing.T) {
	content := []byte("memfd content")
	r := bytes.NewReader(content)
	f, err := DupToMemfd("dup-memfd", r)
	if err != nil {
		t.Fatalf("DupToMemfd error: %v", err)
	}
	defer f.Close()

	// Should be sealed (readonly), so writing should fail
	_, err = f.Write([]byte("fail"))
	if err == nil {
		t.Error("expected write to sealed memfd to fail, but it succeeded")
	}

	// Should be able to read the content
	// Rewind explicitly so the read does not depend on the current offset.
	_, err = f.Seek(0, io.SeekStart)
	if err != nil {
		t.Fatalf("Seek error: %v", err)
	}
	got, err := io.ReadAll(f)
	if err != nil {
		t.Fatalf("ReadAll error: %v", err)
	}
	if string(got) != string(content) {
		t.Errorf("ReadAll = %q, want %q", string(got), string(content))
	}
}
// New is the stub used on non-Linux platforms (see the !linux build
// constraint of this file). It ignores name, never creates a file, and always
// returns errNotImplemented.
func New(name string) (*os.File, error) {
	return nil, errNotImplemented
}
21 | WithBind("/bin", "bin", true) 22 | } 23 | 24 | // Build creates sequence of syscalls for fork_exec 25 | func (b *Builder) Build() ([]SyscallParams, error) { 26 | var err error 27 | ret := make([]SyscallParams, 0, len(b.Mounts)) 28 | for _, m := range b.Mounts { 29 | var mknod bool 30 | if mknod, err = isBindMountFileOrNotExists(m); err != nil { 31 | return nil, err 32 | } 33 | sp, err := m.ToSyscall() 34 | if err != nil { 35 | return nil, err 36 | } 37 | sp.MakeNod = mknod 38 | ret = append(ret, *sp) 39 | } 40 | return ret, nil 41 | } 42 | 43 | // FilterNotExist removes bind mount that does not exists 44 | func (b *Builder) FilterNotExist() *Builder { 45 | rt := b.Mounts[:0] 46 | for _, m := range b.Mounts { 47 | if m.IsBindMount() { 48 | if _, err := os.Stat(m.Source); os.IsNotExist(err) { 49 | continue 50 | } 51 | } 52 | rt = append(rt, m) 53 | } 54 | b.Mounts = rt 55 | return b 56 | } 57 | 58 | func isBindMountFileOrNotExists(m Mount) (bool, error) { 59 | if m.IsBindMount() { 60 | if fi, err := os.Stat(m.Source); os.IsNotExist(err) { 61 | return false, err 62 | } else if !fi.IsDir() { 63 | return true, err 64 | } 65 | } 66 | return false, nil 67 | } 68 | 69 | // WithMounts adds mounts to builder 70 | func (b *Builder) WithMounts(m []Mount) *Builder { 71 | b.Mounts = append(b.Mounts, m...) 
72 | return b 73 | } 74 | 75 | // WithMount adds single mount to builder 76 | func (b *Builder) WithMount(m Mount) *Builder { 77 | b.Mounts = append(b.Mounts, m) 78 | return b 79 | } 80 | 81 | // WithBind adds a bind mount to builder 82 | func (b *Builder) WithBind(source, target string, readonly bool) *Builder { 83 | var flags uintptr = bind 84 | if readonly { 85 | flags |= unix.MS_RDONLY 86 | } 87 | b.Mounts = append(b.Mounts, Mount{ 88 | Source: source, 89 | Target: target, 90 | Flags: flags, 91 | }) 92 | return b 93 | } 94 | 95 | // WithTmpfs adds a tmpfs mount to builder 96 | func (b *Builder) WithTmpfs(target, data string) *Builder { 97 | b.Mounts = append(b.Mounts, Mount{ 98 | Source: "tmpfs", 99 | Target: target, 100 | FsType: "tmpfs", 101 | Flags: mFlag, 102 | Data: data, 103 | }) 104 | return b 105 | } 106 | 107 | // WithProc adds proc file system mounted read-only 108 | func (b *Builder) WithProc() *Builder { 109 | return b.WithProcRW(false) 110 | } 111 | 112 | // WithProcRW adds proc file system, possibly read-write 113 | func (b *Builder) WithProcRW(canWrite bool) *Builder { 114 | var flags uintptr = unix.MS_NOSUID | unix.MS_NODEV | unix.MS_NOEXEC 115 | if !canWrite { 116 | flags |= unix.MS_RDONLY 117 | } 118 | b.Mounts = append(b.Mounts, Mount{ 119 | Source: "proc", 120 | Target: "proc", 121 | FsType: "proc", 122 | Flags: flags, 123 | }) 124 | return b 125 | } 126 | 127 | func (b Builder) String() string { 128 | var sb strings.Builder 129 | sb.WriteString("Mounts: ") 130 | for i, m := range b.Mounts { 131 | sb.WriteString(m.String()) 132 | if i != len(b.Mounts)-1 { 133 | sb.WriteString(", ") 134 | } 135 | } 136 | return sb.String() 137 | } 138 | -------------------------------------------------------------------------------- /pkg/mount/builder_linux_test.go: -------------------------------------------------------------------------------- 1 | package mount 2 | 3 | import ( 4 | "os" 5 | "strings" 6 | "testing" 7 | ) 8 | 9 | func TestBuilder_WithBind(t 
// TestBuilder_WithProcRW checks that WithProcRW(true) appends exactly one
// mount, that its FsType is "proc", and that it is not flagged read-only.
func TestBuilder_WithProcRW(t *testing.T) {
	b := NewBuilder().WithProcRW(true)
	// Fatal here: the index below would be out of range otherwise.
	if len(b.Mounts) != 1 {
		t.Fatalf("expected 1 mount, got %d", len(b.Mounts))
	}
	m := b.Mounts[0]
	if m.FsType != "proc" {
		t.Errorf("expected proc fsType")
	}
	if m.IsReadOnly() {
		t.Errorf("expected read-write proc mount")
	}
}
if len(b.Mounts) != 1 { 81 | t.Fatalf("expected 1 mount, got %d", len(b.Mounts)) 82 | } 83 | } 84 | 85 | func TestBuilder_String(t *testing.T) { 86 | b := NewBuilder(). 87 | WithBind("/src", "/dst", false). 88 | WithTmpfs("/tmp", "size=1m"). 89 | WithProc() 90 | s := b.String() 91 | if !strings.HasPrefix(s, "Mounts: ") { 92 | t.Errorf("unexpected prefix: %q", s) 93 | } 94 | if !strings.Contains(s, "bind[/src:/dst:rw]") { 95 | t.Errorf("missing bind: %q", s) 96 | } 97 | if !strings.Contains(s, "tmpfs[/tmp]") { 98 | t.Errorf("missing tmpfs: %q", s) 99 | } 100 | if !strings.Contains(s, "proc[ro]") { 101 | t.Errorf("missing proc: %q", s) 102 | } 103 | } 104 | 105 | func TestBuilder_FilterNotExist(t *testing.T) { 106 | tmpDir := t.TempDir() 107 | tmpFilePath := tmpDir + "/mounttest" 108 | f, err := os.Create(tmpFilePath) 109 | if err != nil { 110 | t.Fatal(err) 111 | } 112 | f.Close() 113 | 114 | b := NewBuilder(). 115 | WithBind(f.Name(), "/dst1", false). 116 | WithBind("/not/exist", "/dst2", false) 117 | b.FilterNotExist() 118 | if len(b.Mounts) != 1 { 119 | t.Errorf("expected 1 mount after filter, got %d", len(b.Mounts)) 120 | } 121 | if b.Mounts[0].Source != f.Name() { 122 | t.Errorf("unexpected mount: %+v", b.Mounts[0]) 123 | } 124 | } 125 | -------------------------------------------------------------------------------- /pkg/mount/doc.go: -------------------------------------------------------------------------------- 1 | // Package mount provides general data structure for mount and mount namespace (multiple mounts) definition. 
// pathPrefix lists every leading portion of path that ends right before a
// '/' separator, followed by path itself as the last element.
// A '/' at index 0 does not produce an entry, so "/a/b" yields
// ["/a", "/a/b"]. The result always has at least one element.
func pathPrefix(path string) []string {
	prefixes := []string{}
	for pos, ch := range []byte(path) {
		// Skip the leading byte and everything that is not a separator.
		if pos == 0 || ch != '/' {
			continue
		}
		prefixes = append(prefixes, path[:pos])
	}
	return append(prefixes, path)
}
// ensureMountTargetExists prepares target so that source can be mounted on it.
//
// When source can be stat'ed and is not a directory, the parent directory of
// target is created and target itself is created with mknod; a mknod failure
// is tolerated if target already exists as a regular file. In every other
// case (source is a directory or cannot be stat'ed) target is created as a
// directory, including missing parents.
func ensureMountTargetExists(source, target string) error {
	srcInfo, statErr := os.Stat(source)
	if statErr != nil || srcInfo.IsDir() {
		return os.MkdirAll(target, 0755)
	}

	// File-like source: the mount point has to be a file inside an existing directory.
	if err := os.MkdirAll(filepath.Dir(target), 0755); err != nil {
		return err
	}
	mknodErr := syscall.Mknod(target, 0755, 0)
	if mknodErr == nil {
		return nil
	}
	// double check if file exists
	if info, err := os.Lstat(target); err == nil && info.Mode().IsRegular() {
		return nil
	}
	return mknodErr
}
""}, 50 | want: "bind[/src:/dst:rw]", 51 | }, 52 | { 53 | m: Mount{Source: "/src", Target: "/dst", Flags: syscall.MS_BIND | syscall.MS_RDONLY, FsType: "", Data: ""}, 54 | want: "bind[/src:/dst:ro]", 55 | }, 56 | { 57 | m: Mount{Source: "", Target: "/tmp", FsType: "tmpfs"}, 58 | want: "tmpfs[/tmp]", 59 | }, 60 | { 61 | m: Mount{Source: "", Target: "proc", FsType: "proc", Flags: syscall.MS_RDONLY}, 62 | want: "proc[ro]", 63 | }, 64 | { 65 | m: Mount{Source: "src", Target: "dst", FsType: "other", Flags: 0, Data: "data"}, 66 | want: "mount[other,src:dst:0,data]", 67 | }, 68 | } 69 | for _, tt := range tests { 70 | got := tt.m.String() 71 | if got != tt.want { 72 | t.Errorf("Mount.String() = %q, want %q", got, tt.want) 73 | } 74 | } 75 | } 76 | 77 | func TestEnsureMountTargetExists_Dir(t *testing.T) { 78 | tmpDir := t.TempDir() 79 | target := filepath.Join(tmpDir, "foo/bar") 80 | err := ensureMountTargetExists(tmpDir, target) 81 | if err != nil { 82 | t.Fatalf("ensureMountTargetExists error: %v", err) 83 | } 84 | info, err := os.Stat(target) 85 | if err != nil { 86 | t.Fatalf("stat error: %v", err) 87 | } 88 | if !info.IsDir() { 89 | t.Errorf("expected directory at %s", target) 90 | } 91 | } 92 | 93 | func TestEnsureMountTargetExists_File(t *testing.T) { 94 | tmpDir := t.TempDir() 95 | srcFile := filepath.Join(tmpDir, "srcfile") 96 | if err := os.WriteFile(srcFile, []byte("x"), 0644); err != nil { 97 | t.Fatalf("write srcfile: %v", err) 98 | } 99 | target := filepath.Join(tmpDir, "targetfile") 100 | err := ensureMountTargetExists(srcFile, target) 101 | if err != nil { 102 | t.Fatalf("ensureMountTargetExists error: %v", err) 103 | } 104 | // Should be a file or at least exist 105 | info, err := os.Lstat(target) 106 | if err != nil { 107 | t.Fatalf("lstat error: %v", err) 108 | } 109 | if info.IsDir() { 110 | t.Errorf("expected file at %s, got directory", target) 111 | } 112 | } 113 | -------------------------------------------------------------------------------- 
// NewPipe opens an OS pipe and starts a goroutine that forwards at most n
// bytes from the pipe's read end into writer.
//
// It returns a channel that is closed once that forwarding has finished
// (n bytes copied, EOF, or a copy error), and the write end of the pipe.
// After signalling, the goroutine keeps draining the read end until EOF so a
// producer never blocks or gets SIGPIPE, then closes the read end.
// The caller is responsible for closing the returned write end.
func NewPipe(writer io.Writer, n int64) (<-chan struct{}, *os.File, error) {
	readEnd, writeEnd, err := os.Pipe()
	if err != nil {
		return nil, nil, err
	}

	finished := make(chan struct{})
	go func() {
		defer readEnd.Close()
		// Copy errors are deliberately not reported; completion is the only signal.
		io.CopyN(writer, readEnd, n)
		close(finished)
		// ensure no blocking / SIGPIPE on the other end
		io.Copy(io.Discard, readEnd)
	}()
	return finished, writeEnd, nil
}
// TestNewBuffer_MaxBytes checks the truncation behaviour of a Buffer: when
// more than max bytes are written, exactly max+1 bytes are collected.
// The extra byte comes from NewBuffer asking NewPipe for max+1 bytes.
func TestNewBuffer_MaxBytes(t *testing.T) {
	const max = 5
	buf, err := NewBuffer(max)
	if err != nil {
		t.Fatalf("NewBuffer error: %v", err)
	}
	defer buf.W.Close()

	// Write more than max bytes
	input := "toolonginput"
	_, err = io.Copy(buf.W, strings.NewReader(input))
	if err != nil {
		t.Fatalf("Copy error: %v", err)
	}
	// Close the write end and wait for the copy goroutine before reading the buffer.
	buf.W.Close()
	<-buf.Done

	got := buf.Buffer.String()
	if len(got) != int(max+1) {
		t.Errorf("Buffer length = %d, want %d", len(got), max+1)
	}
	if got != input[:max+1] {
		t.Errorf("Buffer content = %q, want %q", got, input[:max+1])
	}
}
// RLimits collects the resource limits that are turned into setrlimit
// arguments by PrepareRLimit. A zero numeric field (or a false DisableCore)
// produces no entry for that resource.
type RLimits struct {
	CPU          uint64 // in s
	CPUHard      uint64 // in s
	Data         uint64 // in bytes
	FileSize     uint64 // in bytes
	Stack        uint64 // in bytes
	AddressSpace uint64 // in bytes
	OpenFile     uint64 // count
	DisableCore  bool   // set core to 0
}

// RLimit pairs one resource identifier with the limit to apply to it.
type RLimit struct {
	// Res is the resource type (e.g. syscall.RLIMIT_CPU)
	Res int
	// Rlim is the limit applied to that resource
	Rlim syscall.Rlimit
}

// getRlimit builds a syscall.Rlimit from a current and a maximum value.
func getRlimit(cur, max uint64) syscall.Rlimit {
	var lim syscall.Rlimit
	lim.Cur = cur
	lim.Max = max
	return lim
}

// PrepareRLimit converts the configured fields into a list of RLimit entries.
//
// Entries appear in a fixed order: CPU, Data, FileSize, Stack, AddressSpace,
// OpenFile, Core. Every resource uses the same value for its current and
// maximum limit, except CPU whose maximum is CPUHard when that is larger than
// CPU. The result is nil when nothing is configured.
func (r *RLimits) PrepareRLimit() []RLimit {
	// The CPU maximum is never allowed to be below the CPU current limit.
	hardCPU := r.CPUHard
	if hardCPU < r.CPU {
		hardCPU = r.CPU
	}

	candidates := [...]struct {
		enabled    bool
		res        int
		soft, hard uint64
	}{
		{r.CPU > 0, syscall.RLIMIT_CPU, r.CPU, hardCPU},
		{r.Data > 0, syscall.RLIMIT_DATA, r.Data, r.Data},
		{r.FileSize > 0, syscall.RLIMIT_FSIZE, r.FileSize, r.FileSize},
		{r.Stack > 0, syscall.RLIMIT_STACK, r.Stack, r.Stack},
		{r.AddressSpace > 0, syscall.RLIMIT_AS, r.AddressSpace, r.AddressSpace},
		{r.OpenFile > 0, syscall.RLIMIT_NOFILE, r.OpenFile, r.OpenFile},
		{r.DisableCore, syscall.RLIMIT_CORE, 0, 0},
	}

	var limits []RLimit
	for _, c := range candidates {
		if !c.enabled {
			continue
		}
		limits = append(limits, RLimit{
			Res:  c.res,
			Rlim: getRlimit(c.soft, c.hard),
		})
	}
	return limits
}
syscall.RLIMIT_CORE: 106 | t = "Core" 107 | } 108 | return fmt.Sprintf("%s[%v:%v]", t, runner.Size(r.Rlim.Cur), runner.Size(r.Rlim.Max)) 109 | } 110 | 111 | func (r RLimits) String() string { 112 | var sb strings.Builder 113 | sb.WriteString("RLimits[") 114 | for i, rl := range r.PrepareRLimit() { 115 | if i > 0 { 116 | sb.WriteByte(',') 117 | } 118 | sb.WriteString(rl.String()) 119 | } 120 | sb.WriteString("]") 121 | return sb.String() 122 | } 123 | -------------------------------------------------------------------------------- /pkg/rlimit/rlimit_test.go: -------------------------------------------------------------------------------- 1 | //go:build linux 2 | 3 | package rlimit 4 | 5 | import ( 6 | "syscall" 7 | "testing" 8 | ) 9 | 10 | func TestPrepareRLimit(t *testing.T) { 11 | tests := []struct { 12 | name string 13 | rl RLimits 14 | expect []int 15 | }{ 16 | { 17 | name: "Empty", 18 | rl: RLimits{}, 19 | expect: []int{}, 20 | }, 21 | { 22 | name: "CPU only", 23 | rl: RLimits{CPU: 1}, 24 | expect: []int{syscall.RLIMIT_CPU}, 25 | }, 26 | { 27 | name: "Data only", 28 | rl: RLimits{Data: 1024}, 29 | expect: []int{syscall.RLIMIT_DATA}, 30 | }, 31 | { 32 | name: "All fields", 33 | rl: RLimits{CPU: 1, CPUHard: 2, Data: 1024, FileSize: 2048, Stack: 4096, AddressSpace: 8192, OpenFile: 16, DisableCore: true}, 34 | expect: []int{syscall.RLIMIT_CPU, syscall.RLIMIT_DATA, syscall.RLIMIT_FSIZE, syscall.RLIMIT_STACK, syscall.RLIMIT_AS, syscall.RLIMIT_NOFILE, syscall.RLIMIT_CORE}, 35 | }, 36 | { 37 | name: "DisableCore only", 38 | rl: RLimits{DisableCore: true}, 39 | expect: []int{syscall.RLIMIT_CORE}, 40 | }, 41 | } 42 | 43 | for _, tt := range tests { 44 | t.Run(tt.name, func(t *testing.T) { 45 | rls := tt.rl.PrepareRLimit() 46 | if len(rls) != len(tt.expect) { 47 | t.Fatalf("expected %d rlimits, got %d", len(tt.expect), len(rls)) 48 | } 49 | for i, r := range rls { 50 | if r.Res != tt.expect[i] { 51 | t.Errorf("expected Res %d at %d, got %d", tt.expect[i], i, r.Res) 52 | } 
53 | } 54 | }) 55 | } 56 | } 57 | 58 | func TestRLimitString(t *testing.T) { 59 | tests := []struct { 60 | name string 61 | rl RLimit 62 | want string 63 | }{ 64 | { 65 | name: "CPU", 66 | rl: RLimit{Res: syscall.RLIMIT_CPU, Rlim: syscall.Rlimit{Cur: 1, Max: 2}}, 67 | want: "CPU[1 s:2 s]", 68 | }, 69 | { 70 | name: "NOFILE", 71 | rl: RLimit{Res: syscall.RLIMIT_NOFILE, Rlim: syscall.Rlimit{Cur: 10, Max: 20}}, 72 | want: "OpenFile[10:20]", 73 | }, 74 | { 75 | name: "DATA", 76 | rl: RLimit{Res: syscall.RLIMIT_DATA, Rlim: syscall.Rlimit{Cur: 1024, Max: 2048}}, 77 | want: "Data[1.0 KiB:2.0 KiB]", 78 | }, 79 | { 80 | name: "FSIZE", 81 | rl: RLimit{Res: syscall.RLIMIT_FSIZE, Rlim: syscall.Rlimit{Cur: 100, Max: 200}}, 82 | want: "File[100 B:200 B]", 83 | }, 84 | { 85 | name: "STACK", 86 | rl: RLimit{Res: syscall.RLIMIT_STACK, Rlim: syscall.Rlimit{Cur: 4096, Max: 8192}}, 87 | want: "Stack[4.0 KiB:8.0 KiB]", 88 | }, 89 | { 90 | name: "AS", 91 | rl: RLimit{Res: syscall.RLIMIT_AS, Rlim: syscall.Rlimit{Cur: 123, Max: 456}}, 92 | want: "AddressSpace[123 B:456 B]", 93 | }, 94 | { 95 | name: "CORE", 96 | rl: RLimit{Res: syscall.RLIMIT_CORE, Rlim: syscall.Rlimit{Cur: 0, Max: 0}}, 97 | want: "Core[0 B:0 B]", 98 | }, 99 | } 100 | 101 | for _, tt := range tests { 102 | t.Run(tt.name, func(t *testing.T) { 103 | got := tt.rl.String() 104 | if got != tt.want { 105 | t.Errorf("got %q, want %q", got, tt.want) 106 | } 107 | }) 108 | } 109 | } 110 | 111 | func TestRLimitsString(t *testing.T) { 112 | rl := RLimits{ 113 | CPU: 1, 114 | CPUHard: 2, 115 | Data: 1024, 116 | FileSize: 2048, 117 | Stack: 4096, 118 | AddressSpace: 8192, 119 | OpenFile: 16, 120 | DisableCore: true, 121 | } 122 | want := "RLimits[CPU[1 s:2 s],Data[1.0 KiB:1.0 KiB],File[2.0 KiB:2.0 KiB],Stack[4.0 KiB:4.0 KiB],AddressSpace[8.0 KiB:8.0 KiB],OpenFile[16:16],Core[0 B:0 B]]" 123 | got := rl.String() 124 | if got != want { 125 | t.Errorf("got %q, want %q", got, want) 126 | } 127 | } 128 | 129 | func TestRLimitsString_Empty(t 
// Filter is the BPF seccomp filter value
type Filter []syscall.SockFilter

// SockFprog converts Filter to SockFprog for seccomp syscall.
//
// The returned program points at the first element of f, so it shares f's
// backing array and f must stay alive and unmodified while the program is in
// use. An empty filter yields a program with Len 0 and a nil Filter pointer
// instead of panicking on the missing first element.
//
// Len is a uint16: a filter with more than 65535 instructions is truncated by
// the conversion.
func (f Filter) SockFprog() *syscall.SockFprog {
	if len(f) == 0 {
		return &syscall.SockFprog{}
	}
	return &syscall.SockFprog{
		Len:    uint16(len(f)),
		Filter: &f[0],
	}
}
ToSeccompAction(a Action) libseccomp.Action { 9 | var action libseccomp.Action 10 | switch a.Action() { 11 | case ActionAllow: 12 | action = libseccomp.ActionAllow 13 | case ActionErrno: 14 | action = libseccomp.ActionErrno 15 | case ActionTrace: 16 | action = libseccomp.ActionTrace 17 | default: 18 | action = libseccomp.ActionKillProcess 19 | } 20 | // the least 16 bit of ret value is SECCOMP_RET_DATA 21 | // although it might not officially supported by go-seccomp-bpf 22 | // action = action.WithReturnData(int(a.ReturnCode())) 23 | return action 24 | } 25 | -------------------------------------------------------------------------------- /pkg/seccomp/libseccomp/builder_linux.go: -------------------------------------------------------------------------------- 1 | package libseccomp 2 | 3 | import ( 4 | "syscall" 5 | 6 | "github.com/criyle/go-sandbox/pkg/seccomp" 7 | libseccomp "github.com/elastic/go-seccomp-bpf" 8 | "golang.org/x/net/bpf" 9 | ) 10 | 11 | // Builder is used to build the filter 12 | type Builder struct { 13 | Allow, Trace []string 14 | Default Action 15 | } 16 | 17 | var actTrace = libseccomp.ActionTrace 18 | 19 | // Build builds the filter 20 | func (b *Builder) Build() (seccomp.Filter, error) { 21 | policy := libseccomp.Policy{ 22 | DefaultAction: ToSeccompAction(b.Default), 23 | Syscalls: []libseccomp.SyscallGroup{ 24 | { 25 | Action: libseccomp.ActionAllow, 26 | Names: b.Allow, 27 | }, 28 | { 29 | Action: actTrace, 30 | Names: b.Trace, 31 | }, 32 | }, 33 | } 34 | program, err := policy.Assemble() 35 | if err != nil { 36 | return nil, err 37 | } 38 | return ExportBPF(program) 39 | } 40 | 41 | // ExportBPF convert libseccomp filter to kernel readable BPF content 42 | func ExportBPF(filter []bpf.Instruction) (seccomp.Filter, error) { 43 | raw, err := bpf.Assemble(filter) 44 | if err != nil { 45 | return nil, err 46 | } 47 | return sockFilter(raw), nil 48 | } 49 | 50 | func sockFilter(raw []bpf.RawInstruction) []syscall.SockFilter { 51 | filter := 
make([]syscall.SockFilter, 0, len(raw)) 52 | for _, instruction := range raw { 53 | filter = append(filter, syscall.SockFilter{ 54 | Code: instruction.Op, 55 | Jt: instruction.Jt, 56 | Jf: instruction.Jf, 57 | K: instruction.K, 58 | }) 59 | } 60 | return filter 61 | } 62 | -------------------------------------------------------------------------------- /pkg/seccomp/libseccomp/doc.go: -------------------------------------------------------------------------------- 1 | // Package libseccomp provides a wrapper for "github.com/seccomp/libseccomp-golang" 2 | package libseccomp 3 | -------------------------------------------------------------------------------- /pkg/seccomp/libseccomp/seccomp_linux_test.go: -------------------------------------------------------------------------------- 1 | package libseccomp 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/criyle/go-sandbox/pkg/seccomp" 7 | ) 8 | 9 | var ( 10 | defaultSyscallAllows = []string{ 11 | "read", "write", "readv", "writev", "close", "fstat", "lseek", "dup", "dup2", "dup3", "ioctl", "fcntl", "fadvise64", 12 | "mmap", "mprotect", "munmap", "brk", "mremap", "msync", "mincore", "madvise", 13 | "rt_sigaction", "rt_sigprocmask", "rt_sigreturn", "rt_sigpending", "sigaltstack", 14 | "getcwd", "exit", "exit_group", "arch_prctl", 15 | "gettimeofday", "getrlimit", "getrusage", "times", "time", "clock_gettime", "restart_syscall", 16 | } 17 | 18 | defaultSyscallTraces = []string{ 19 | "execve", "open", "openat", "unlink", "unlinkat", "readlink", "readlinkat", "lstat", "stat", "access", "faccessat", 20 | } 21 | ) 22 | 23 | func TestBuildFilter(t *testing.T) { 24 | _, err := buildFilterMock() 25 | if err != nil { 26 | t.Error("BuildFilter failed") 27 | } 28 | } 29 | 30 | // BenchmarkBuildDefaultFilter is about 0.2ms/op 31 | func BenchmarkBuildDefaultFilter(b *testing.B) { 32 | for i := 0; i < b.N; i++ { 33 | builder := Builder{ 34 | Allow: defaultSyscallAllows, 35 | Trace: defaultSyscallTraces, 36 | Default: ActionTrace, 
37 | } 38 | builder.Build() 39 | } 40 | } 41 | 42 | func buildFilterMock() (seccomp.Filter, error) { 43 | b := Builder{ 44 | Allow: []string{"fork"}, 45 | Trace: []string{"execve"}, 46 | Default: ActionTrace, 47 | } 48 | return b.Build() 49 | } 50 | -------------------------------------------------------------------------------- /pkg/seccomp/libseccomp/syscall_name_linux.go: -------------------------------------------------------------------------------- 1 | package libseccomp 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/elastic/go-seccomp-bpf/arch" 7 | ) 8 | 9 | var info, errInfo = arch.GetInfo("") 10 | 11 | // ToSyscallName convert syscallno to syscall name 12 | func ToSyscallName(sysno uint) (string, error) { 13 | if errInfo != nil { 14 | return "", errInfo 15 | } 16 | n, ok := info.SyscallNumbers[int(sysno)] 17 | if !ok { 18 | return "", fmt.Errorf("syscall number does not exist: %d", sysno) 19 | } 20 | return n, nil 21 | } 22 | -------------------------------------------------------------------------------- /pkg/unixsocket/benchmark_linux_test.go: -------------------------------------------------------------------------------- 1 | package unixsocket 2 | 3 | import "testing" 4 | 5 | func BenchmarkBaseline(b *testing.B) { 6 | s, t, err := NewSocketPair() 7 | if err != nil { 8 | b.Fatal(err) 9 | } 10 | m := make([]byte, 1024) 11 | b.ResetTimer() 12 | go func() { 13 | msg := []byte("message") 14 | for i := 0; i < b.N; i++ { 15 | s.SendMsg(msg, Msg{}) 16 | } 17 | }() 18 | 19 | for i := 0; i < b.N; i++ { 20 | t.RecvMsg(m) 21 | } 22 | } 23 | 24 | func BenchmarkGoroutine(b *testing.B) { 25 | s, t, err := NewSocketPair() 26 | if err != nil { 27 | b.Fatal(err) 28 | } 29 | m := make([]byte, 1024) 30 | b.ResetTimer() 31 | go func() { 32 | msg := []byte("message") 33 | for i := 0; i < b.N; i++ { 34 | s.SendMsg(msg, Msg{}) 35 | } 36 | }() 37 | 38 | for i := 0; i < b.N; i++ { 39 | c := make(chan struct{}) 40 | go func() { 41 | defer close(c) 42 | t.RecvMsg(m) 43 | }() 44 | 
<-c 45 | } 46 | } 47 | 48 | func BenchmarkChannel(b *testing.B) { 49 | c := make(chan []byte) 50 | benchGoroutine(b, c) 51 | } 52 | 53 | func BenchmarkChannelBuffed(b *testing.B) { 54 | c := make(chan []byte, 1) 55 | benchGoroutine(b, c) 56 | } 57 | 58 | func BenchmarkChannelBuffed4(b *testing.B) { 59 | c := make(chan []byte, 4) 60 | benchGoroutine(b, c) 61 | } 62 | 63 | func BenchmarkEmptyGoroutine(b *testing.B) { 64 | for i := 0; i < b.N; i++ { 65 | c := make(chan struct{}) 66 | go func() { 67 | close(c) 68 | }() 69 | <-c 70 | } 71 | } 72 | 73 | func benchGoroutine(b *testing.B, c chan []byte) { 74 | s, t, err := NewSocketPair() 75 | if err != nil { 76 | b.Fatal(err) 77 | } 78 | 79 | go func() { 80 | msg := []byte("message") 81 | for i := 0; i < b.N; i++ { 82 | s.SendMsg(msg, Msg{}) 83 | } 84 | }() 85 | 86 | b.ResetTimer() 87 | go func() { 88 | m := make([]byte, 1024) 89 | for i := 0; i < b.N; i++ { 90 | t.RecvMsg(m) 91 | c <- m 92 | } 93 | }() 94 | 95 | for i := 0; i < b.N; i++ { 96 | <-c 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /pkg/unixsocket/socket_linux.go: -------------------------------------------------------------------------------- 1 | // Package unixsocket provides wrapper for Linux unix socket to send and recv oob messages 2 | // including fd and user credential. 
// oobSize is the size of the per-socket out-of-band scratch buffers (4 KiB).
const oobSize = 4 << 10 // 4kb

// Socket wraps a unix socket connection together with reusable buffers for
// out-of-band data.
//
// SendMsg and RecvMsg reuse sendBuff and recvBuff respectively, so concurrent
// SendMsg calls (or concurrent RecvMsg calls) on the same Socket share a
// buffer.
type Socket struct {
	*net.UnixConn
	sendBuff []byte
	recvBuff []byte
}

// Msg is the out-of-band part of a message.
type Msg struct {
	Fds  []int          // unix rights
	Cred *syscall.Ucred // unix credential
}

// newSocket wraps conn and allocates its out-of-band buffers.
func newSocket(conn *net.UnixConn) *Socket {
	return &Socket{
		UnixConn: conn,
		sendBuff: make([]byte, oobSize),
		recvBuff: make([]byte, oobSize),
	}
}

// NewSocket creates a Socket from an existing unix socket fd (created by
// socketpair or net.DialUnix) and marks it close_on_exec to avoid fd leaks.
//
// Ownership: whenever os.NewFile accepts fd, fd itself is closed before
// NewSocket returns, on success and on failure alike; the returned Socket
// uses the connection obtained from net.FileConn. Callers must therefore not
// close fd again afterwards.
//
// It needs a SOCK_SEQPACKET socket for reliable transfer and SO_PASSCRED to
// pass unix credentials. Notice: per the documentation, if cred is not
// specified, self information will be sent.
func NewSocket(fd int) (*Socket, error) {
	// Best effort: an unusable fd is reported by os.NewFile / net.FileConn
	// below, so the result is not checked here.
	_ = syscall.SetNonblock(fd, true)
	syscall.CloseOnExec(fd)

	file := os.NewFile(uintptr(fd), "unix-socket")
	if file == nil {
		return nil, fmt.Errorf("new socket: %d is not a valid fd", fd)
	}
	defer file.Close()

	conn, err := net.FileConn(file)
	if err != nil {
		return nil, fmt.Errorf("new socket: fileconn: %w", err)
	}

	unixConn, ok := conn.(*net.UnixConn)
	if !ok {
		conn.Close()
		return nil, fmt.Errorf("new socket: %d is not a valid unix socket connection", fd)
	}
	return newSocket(unixConn), nil
}

// NewSocketPair creates connected unix socketpair using SOCK_SEQPACKET.
// On failure every descriptor created here is closed exactly once.
func NewSocketPair() (*Socket, *Socket, error) {
	fd, err := syscall.Socketpair(syscall.AF_LOCAL, syscall.SOCK_SEQPACKET|syscall.SOCK_CLOEXEC, 0)
	if err != nil {
		return nil, nil, fmt.Errorf("new socket pair: socketpair: %w", err)
	}

	ins, err := NewSocket(fd[0])
	if err != nil {
		// NewSocket already closed fd[0]; closing it again could hit an
		// unrelated descriptor that reused the number.
		syscall.Close(fd[1])
		return nil, nil, fmt.Errorf("new socket pair: sender: %w", err)
	}

	outs, err := NewSocket(fd[1])
	if err != nil {
		// fd[1] was closed by NewSocket; only the first socket is left.
		ins.Close()
		return nil, nil, fmt.Errorf("new socket pair: receiver: %w", err)
	}

	return ins, outs, nil
}

// SetPassCred sets the SO_PASSCRED socket option to option and returns the
// error from obtaining the raw connection, from running the control function,
// or from setsockopt itself.
func (s *Socket) SetPassCred(option int) error {
	sysconn, err := s.SyscallConn()
	if err != nil {
		return err
	}
	var sockErr error
	if err := sysconn.Control(func(fd uintptr) {
		sockErr = syscall.SetsockoptInt(int(fd), syscall.SOL_SOCKET, syscall.SO_PASSCRED, option)
	}); err != nil {
		return err
	}
	return sockErr
}

// SendMsg sends b over the socket, attaching m.Fds as unix rights and m.Cred
// as unix credentials when they are set.
func (s *Socket) SendMsg(b []byte, m Msg) error {
	// Encode into the reusable send buffer; bytes.Buffer grows past it only
	// if the control data does not fit.
	oob := bytes.NewBuffer(s.sendBuff[:0])
	if len(m.Fds) > 0 {
		oob.Write(syscall.UnixRights(m.Fds...))
	}
	if m.Cred != nil {
		oob.Write(syscall.UnixCredentials(m.Cred))
	}

	_, _, err := s.WriteMsgUnix(b, oob.Bytes(), nil)
	return err
}

// RecvMsg receives one message into b and parses attached unix rights and
// credentials. It returns the number of payload bytes read. The caller owns
// (and must close) every descriptor in the returned Msg.Fds.
func (s *Socket) RecvMsg(b []byte) (int, Msg, error) {
	var msg Msg
	n, oobn, _, _, err := s.ReadMsgUnix(b, s.recvBuff)
	if err != nil {
		return 0, msg, err
	}
	// parse oob msg
	msgs, err := syscall.ParseSocketControlMessage(s.recvBuff[:oobn])
	if err != nil {
		return 0, msg, err
	}
	msg, err = parseMsg(msgs)
	if err != nil {
		return 0, msg, err
	}
	return n, msg, nil
}

// parseMsg extracts credentials and file descriptors from SOL_SOCKET control
// messages; messages of other levels or types are skipped.
//
// Descriptors from every SCM_RIGHTS message are accumulated, so none of them
// is dropped (and leaked) when more than one such message is present. If
// parsing fails, all descriptors collected so far are closed and Fds is
// reset to nil.
func parseMsg(msgs []syscall.SocketControlMessage) (msg Msg, err error) {
	defer func() {
		if err != nil {
			for _, f := range msg.Fds {
				syscall.Close(f)
			}
			msg.Fds = nil
		}
	}()
	for i := range msgs {
		m := &msgs[i]
		if m.Header.Level != syscall.SOL_SOCKET {
			continue
		}

		switch m.Header.Type {
		case syscall.SCM_CREDENTIALS:
			cred, perr := syscall.ParseUnixCredentials(m)
			if perr != nil {
				return msg, perr
			}
			msg.Cred = cred

		case syscall.SCM_RIGHTS:
			fds, perr := syscall.ParseUnixRights(m)
			if perr != nil {
				return msg, perr
			}
			msg.Fds = append(msg.Fds, fds...)
		}
	}
	return msg, nil
}
len(m.Fds) != 1 { 62 | t.Errorf("expected 1 fd, got %d", len(m.Fds)) 63 | } 64 | if m.Fds != nil { 65 | syscall.Close(m.Fds[0]) 66 | } 67 | } 68 | 69 | func TestSendRecvMsg_Cred(t *testing.T) { 70 | if os.Geteuid() != 0 { 71 | t.Skip("skipping credential test: requires root privileges") 72 | return 73 | } 74 | a, b, err := NewSocketPair() 75 | if err != nil { 76 | t.Fatal(err) 77 | } 78 | defer a.Close() 79 | defer b.Close() 80 | 81 | // Enable credential passing 82 | if err := a.SetPassCred(1); err != nil { 83 | t.Fatal(err) 84 | } 85 | if err := b.SetPassCred(1); err != nil { 86 | t.Fatal(err) 87 | } 88 | 89 | msg := []byte("credtest") 90 | go func() { 91 | a.SendMsg(msg, Msg{Cred: &syscall.Ucred{Pid: 123, Uid: 456, Gid: 789}}) 92 | }() 93 | 94 | buf := make([]byte, 64) 95 | n, m, err := b.RecvMsg(buf) 96 | if err != nil { 97 | t.Fatal(err) 98 | } 99 | if !bytes.Equal(buf[:n], msg) { 100 | t.Errorf("RecvMsg got %q, want %q", buf[:n], msg) 101 | } 102 | if m.Cred == nil { 103 | t.Error("expected credential, got nil") 104 | } 105 | } 106 | 107 | func TestNewSocketPair_Close(t *testing.T) { 108 | a, b, err := NewSocketPair() 109 | if err != nil { 110 | t.Fatal(err) 111 | } 112 | if err := a.Close(); err != nil { 113 | t.Errorf("a.Close() error: %v", err) 114 | } 115 | if err := b.Close(); err != nil { 116 | t.Errorf("b.Close() error: %v", err) 117 | } 118 | } 119 | 120 | func TestNewSocket_InvalidFd(t *testing.T) { 121 | // Use an invalid fd 122 | _, err := NewSocket(-1) 123 | if err == nil { 124 | t.Error("expected error for invalid fd, got nil") 125 | } 126 | } 127 | 128 | func TestSetPassCred_InvalidSocket(t *testing.T) { 129 | a, b, err := NewSocketPair() 130 | if err != nil { 131 | t.Fatal(err) 132 | } 133 | defer a.Close() 134 | defer b.Close() 135 | 136 | // Close the socket to make it invalid 137 | a.Close() 138 | err = a.SetPassCred(1) 139 | if err == nil { 140 | t.Error("expected error on SetPassCred for closed socket, got nil") 141 | } 142 | } 143 | 
-------------------------------------------------------------------------------- /ptracer/context_helper_linux.go: -------------------------------------------------------------------------------- 1 | package ptracer 2 | 3 | import ( 4 | "syscall" 5 | "unsafe" 6 | 7 | unix "golang.org/x/sys/unix" 8 | ) 9 | 10 | // TODO: make this method not to call ptrace too much 11 | func ptraceReadStr(pid int, addr uintptr, buff []byte) { 12 | syscall.PtracePeekData(pid, addr, buff) 13 | } 14 | 15 | func processVMReadv(pid int, localIov, remoteIov []unix.Iovec, 16 | flags uintptr) (r1, r2 uintptr, err syscall.Errno) { 17 | return syscall.Syscall6(unix.SYS_PROCESS_VM_READV, uintptr(pid), 18 | uintptr(unsafe.Pointer(&localIov[0])), uintptr(len(localIov)), 19 | uintptr(unsafe.Pointer(&remoteIov[0])), uintptr(len(remoteIov)), 20 | flags) 21 | } 22 | 23 | func vmRead(pid int, addr uintptr, buff []byte) (int, error) { 24 | l := len(buff) 25 | localIov := getIovecs(&buff[0], l) 26 | remoteIov := getIovecs((*byte)(unsafe.Pointer(addr)), l) 27 | n, _, err := processVMReadv(pid, localIov, remoteIov, uintptr(0)) 28 | if err == 0 { 29 | return int(n), nil 30 | } 31 | return int(n), err 32 | } 33 | 34 | func getIovecs(base *byte, l int) []unix.Iovec { 35 | return []unix.Iovec{getIovec(base, l)} 36 | } 37 | 38 | func vmReadStr(pid int, addr uintptr, buff []byte) error { 39 | // Handle unaligned address: calculate remaining bytes to page boundary 40 | totalRead := 0 // Total bytes read so far 41 | // Calculate distance to next page boundary, nextRead is the number of bytes to read 42 | nextRead := pageSize - int(addr%uintptr(pageSize)) 43 | if nextRead == 0 { 44 | nextRead = pageSize // If exactly at page boundary, use full page size 45 | } 46 | 47 | // Read in a loop until buffer is full or termination condition is met 48 | for len(buff) > 0 { 49 | // If remaining buffer is smaller than planned read size, reduce read size 50 | if restToRead := len(buff); restToRead < nextRead { 51 | nextRead = 
// clen returns the length of the NUL-terminated string stored in b: the index
// of the first zero byte, or len(b) when b contains no zero byte.
//
// The result is always a valid upper bound for slicing, so b[:clen(b)] never
// goes out of range (returning len(b)+1 for the unterminated case would).
func clen(b []byte) int {
	for i, c := range b {
		if c == 0 {
			return i
		}
	}
	return len(b)
}
get the string from process data segment 42 | func (c *Context) GetString(addr uintptr) string { 43 | buff := make([]byte, syscall.PathMax) 44 | if UseVMReadv { 45 | if err := vmReadStr(c.Pid, addr, buff); err != nil { 46 | // if ENOSYS, then disable this function 47 | if no, ok := err.(syscall.Errno); ok { 48 | if no == syscall.ENOSYS { 49 | UseVMReadv = false 50 | } 51 | } 52 | } else { 53 | return string(buff[:clen(buff)]) 54 | } 55 | } 56 | syscall.PtracePeekData(c.Pid, addr, buff) 57 | return string(buff[:clen(buff)]) 58 | } 59 | -------------------------------------------------------------------------------- /ptracer/context_linux_amd64.go: -------------------------------------------------------------------------------- 1 | package ptracer 2 | 3 | import ( 4 | "syscall" 5 | 6 | unix "golang.org/x/sys/unix" 7 | ) 8 | 9 | // SyscallNo get current syscall no 10 | func (c *Context) SyscallNo() uint { 11 | return uint(c.regs.Orig_rax) 12 | } 13 | 14 | // Arg0 gets the arg0 for the current syscall 15 | func (c *Context) Arg0() uint { 16 | return uint(c.regs.Rdi) 17 | } 18 | 19 | // Arg1 gets the arg1 for the current syscall 20 | func (c *Context) Arg1() uint { 21 | return uint(c.regs.Rsi) 22 | } 23 | 24 | // Arg2 gets the arg2 for the current syscall 25 | func (c *Context) Arg2() uint { 26 | return uint(c.regs.Rdx) 27 | } 28 | 29 | // Arg3 gets the arg3 for the current syscall 30 | func (c *Context) Arg3() uint { 31 | return uint(c.regs.R10) 32 | } 33 | 34 | // Arg4 gets the arg4 for the current syscall 35 | func (c *Context) Arg4() uint { 36 | return uint(c.regs.R8) 37 | } 38 | 39 | // Arg5 gets the arg5 for the current syscall 40 | func (c *Context) Arg5() uint { 41 | return uint(c.regs.R9) 42 | } 43 | 44 | // SetReturnValue set the return value if skip the syscall 45 | func (c *Context) SetReturnValue(retval int) { 46 | c.regs.Rax = uint64(retval) 47 | } 48 | 49 | func (c *Context) skipSyscall() error { 50 | c.regs.Orig_rax = ^uint64(0) //-1 51 | return 
syscall.PtraceSetRegs(c.Pid, &c.regs) 52 | } 53 | 54 | func getIovec(base *byte, l int) unix.Iovec { 55 | return unix.Iovec{ 56 | Base: base, 57 | Len: uint64(l), 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /ptracer/context_linux_arm.go: -------------------------------------------------------------------------------- 1 | package ptracer 2 | 3 | import ( 4 | "syscall" 5 | 6 | unix "golang.org/x/sys/unix" 7 | ) 8 | 9 | // SyscallNo get current syscall no 10 | func (c *Context) SyscallNo() uint { 11 | return uint(c.regs.Uregs[7]) // R7 12 | } 13 | 14 | // Arg0 gets the arg0 for the current syscall 15 | func (c *Context) Arg0() uint { 16 | return uint(c.regs.Uregs[17]) //Orig_R0 17 | } 18 | 19 | // Arg1 gets the arg1 for the current syscall 20 | func (c *Context) Arg1() uint { 21 | return uint(c.regs.Uregs[1]) // R1 22 | } 23 | 24 | // Arg2 gets the arg2 for the current syscall 25 | func (c *Context) Arg2() uint { 26 | return uint(c.regs.Uregs[2]) // R2 27 | } 28 | 29 | // Arg3 gets the arg3 for the current syscall 30 | func (c *Context) Arg3() uint { 31 | return uint(c.regs.Uregs[3]) // R3 32 | } 33 | 34 | // Arg4 gets the arg4 for the current syscall 35 | func (c *Context) Arg4() uint { 36 | return uint(c.regs.Uregs[4]) // R4 37 | } 38 | 39 | // Arg5 gets the arg5 for the current syscall 40 | func (c *Context) Arg5() uint { 41 | return uint(c.regs.Uregs[5]) //R5 42 | } 43 | 44 | // SetReturnValue set the return value if skip the syscall 45 | func (c *Context) SetReturnValue(retval int) { 46 | c.regs.Uregs[0] = uint32(retval) // R0 47 | } 48 | 49 | func (c *Context) skipSyscall() error { 50 | err := syscall.PtraceSetRegs(c.Pid, &c.regs) 51 | if err != nil { 52 | return err 53 | } 54 | return ptraceArmSetSyscall(c.Pid, -1) 55 | } 56 | 57 | func getIovec(base *byte, l int) unix.Iovec { 58 | return unix.Iovec{ 59 | Base: base, 60 | Len: uint32(l), 61 | } 62 | } 63 | 
// ---- ptracer/context_linux_arm64.go ----

// SyscallNo returns the current syscall number, read from Regs[8].
func (c *Context) SyscallNo() uint {
	return uint(c.regs.Regs[8]) // R8
}

// Arg0 returns argument 0 of the current syscall, read from Regs[0].
func (c *Context) Arg0() uint {
	return uint(c.regs.Regs[0]) // R0
}

// Arg1 returns argument 1 of the current syscall, read from Regs[1].
func (c *Context) Arg1() uint {
	return uint(c.regs.Regs[1]) // R1
}

// Arg2 returns argument 2 of the current syscall, read from Regs[2].
func (c *Context) Arg2() uint {
	return uint(c.regs.Regs[2]) // R2
}

// Arg3 returns argument 3 of the current syscall, read from Regs[3].
func (c *Context) Arg3() uint {
	return uint(c.regs.Regs[3]) // R3
}

// Arg4 returns argument 4 of the current syscall, read from Regs[4].
func (c *Context) Arg4() uint {
	return uint(c.regs.Regs[4]) // R4
}

// Arg5 returns argument 5 of the current syscall, read from Regs[5].
func (c *Context) Arg5() uint {
	return uint(c.regs.Regs[5]) // R5
}

// SetReturnValue stores retval in the cached Regs[0] register. The value is
// only written to the cached register set here; skipSyscall is what pushes
// the registers to the tracee.
func (c *Context) SetReturnValue(retval int) {
	c.regs.Regs[0] = uint64(retval) // R0
}

// skipSyscall writes the cached register set to the tracee with
// ptraceSetRegSet and then sets the tracee's syscall number to -1 through
// ptraceArm64SetSyscall. The first error encountered is returned.
func (c *Context) skipSyscall() error {
	err := ptraceSetRegSet(c.Pid, &c.regs)
	if err != nil {
		return err
	}
	return ptraceArm64SetSyscall(c.Pid, -1)
}

// getIovec builds a unix.Iovec describing l bytes starting at base.
// On this architecture the Len field is a uint64.
func getIovec(base *byte, l int) unix.Iovec {
	return unix.Iovec{
		Base: base,
		Len:  uint64(l),
	}
}
// ---- ptracer/context_other.go (built with !linux) ----

// Context is an empty placeholder for operating systems other than Linux.
// Every accessor below returns a zero value and SetReturnValue does nothing.
type Context struct {
	Pid int
}

// SyscallNo always returns 0 on non-Linux builds.
func (c *Context) SyscallNo() uint {
	return 0
}

// Arg0 always returns 0 on non-Linux builds.
func (c *Context) Arg0() uint {
	return 0
}

// Arg1 always returns 0 on non-Linux builds.
func (c *Context) Arg1() uint {
	return 0
}

// Arg2 always returns 0 on non-Linux builds.
func (c *Context) Arg2() uint {
	return 0
}

// Arg3 always returns 0 on non-Linux builds.
func (c *Context) Arg3() uint {
	return 0
}

// Arg4 always returns 0 on non-Linux builds.
func (c *Context) Arg4() uint {
	return 0
}

// Arg5 always returns 0 on non-Linux builds.
func (c *Context) Arg5() uint {
	return 0
}

// SetReturnValue is a no-op on non-Linux builds.
func (c *Context) SetReturnValue(retval int) {

}

// GetString always returns the empty string on non-Linux builds.
func (c *Context) GetString(addr uintptr) string {
	return ""
}

// ---- ptracer/ptrace_linux.go ----

// ptrace constants
const (
	// NT_PRSTATUS is passed as the addr argument of the GETREGSET / SETREGSET
	// requests issued by ptraceGetRegSet and ptraceSetRegSet.
	NT_PRSTATUS = 1
	// NT_ARM_SYSTEM_CALL is passed as the addr argument of the SETREGSET
	// request issued by ptraceArm64SetSyscall.
	NT_ARM_SYSTEM_CALL = 0x404

	// PTRACE_SET_SYSCALL is the request number used by ptraceArmSetSyscall.
	PTRACE_SET_SYSCALL = 23
)

// ptrace issues a raw SYS_PTRACE syscall with the given request, pid, addr
// and data. It returns the errno as an error when the syscall reports one and
// nil otherwise.
func ptrace(request int, pid int, addr uintptr, data uintptr) (err error) {
	_, _, e1 := syscall.Syscall6(syscall.SYS_PTRACE, uintptr(request), uintptr(pid), uintptr(addr), uintptr(data), 0, 0)
	if e1 != 0 {
		err = e1
	}
	return
}

// ptraceGetRegSet issues PTRACE_GETREGSET with NT_PRSTATUS, passing an iovec
// that covers the whole of *regs as the destination.
func ptraceGetRegSet(pid int, regs *syscall.PtraceRegs) error {
	iov := getIovec((*byte)(unsafe.Pointer(regs)), int(unsafe.Sizeof(*regs)))
	return ptrace(syscall.PTRACE_GETREGSET, pid, NT_PRSTATUS, uintptr(unsafe.Pointer(&iov)))
}

// ptraceSetRegSet issues PTRACE_SETREGSET with NT_PRSTATUS, passing an iovec
// that covers the whole of *regs as the source.
func ptraceSetRegSet(pid int, regs *syscall.PtraceRegs) error {
	iov := getIovec((*byte)(unsafe.Pointer(regs)), int(unsafe.Sizeof(*regs)))
	return ptrace(syscall.PTRACE_SETREGSET, pid, NT_PRSTATUS, uintptr(unsafe.Pointer(&iov)))
}

// ptraceArm64SetSyscall issues PTRACE_SETREGSET with NT_ARM_SYSTEM_CALL,
// passing an iovec that covers the local syscallNo value.
// NOTE(review): the iovec length is unsafe.Sizeof of a Go int; confirm this
// matches the size the kernel expects for this regset.
func ptraceArm64SetSyscall(pid int, syscallNo int) error {
	iov := getIovec((*byte)(unsafe.Pointer(&syscallNo)), int(unsafe.Sizeof(syscallNo)))
	return ptrace(syscall.PTRACE_SETREGSET, pid, NT_ARM_SYSTEM_CALL, uintptr(unsafe.Pointer(&iov)))
}

// ptraceArmSetSyscall issues the PTRACE_SET_SYSCALL request with syscallNo
// passed directly as the data argument.
func ptraceArmSetSyscall(pid int, syscallNo int) error {
	return ptrace(PTRACE_SET_SYSCALL, pid, 0, uintptr(syscallNo))
}

// ---- ptracer/tracer.go ----

// TraceAction defines the action returned by Handler.Handle.
type TraceAction int

const (
	// TraceAllow does not do anything
	TraceAllow TraceAction = iota
	// TraceBan skips the syscall and sets the return value specified by
	// Context.SetReturnValue
	TraceBan
	// TraceKill indicates that a dangerous action has been detected
	TraceKill
)

// Tracer defines a ptracer instance. It embeds the syscall Handler, the
// process Runner and the runner.Limit that applies to the traced program.
type Tracer struct {
	Handler
	Runner
	runner.Limit
}

// Runner represents the process runner
type Runner interface {
	// Start starts the child process and returns its pid, or an error if it
	// failed. The child process should enable ptrace and should stop before
	// ptrace.
	Start() (int, error)
}

// Handler defines customized handler for traced syscall
type Handler interface {
	// Handle returns the action to take on the traced program
	Handle(*Context) TraceAction

	// Debug prints debug information when in debug mode
	Debug(v ...interface{})
}
-------------------------------------------------------------------------------- 1 | // Package runner provides common interface for program runner together with 2 | // common types including Result, Limit, Size and Status. 3 | // 4 | // # Status 5 | // 6 | // Status defines the program running result status including 7 | // 8 | // Normal 9 | // Program Error 10 | // Resource Limit Exceeded (Time / Memory / Output) 11 | // Unauthorized Access (Disallowed Syscall) 12 | // Runtime Error (Signaled / Nonzero Exit Status) 13 | // Program Runner Error 14 | // 15 | // # Size 16 | // 17 | // Size defines size in bytes, underlying type is uint64 so it 18 | // is effective to store up to EiB of size 19 | // 20 | // # Limit 21 | // 22 | // Limit defines Time & Memory restriction on program runner 23 | // 24 | // # Result 25 | // 26 | // Result defines program running result including 27 | // Status, ExitStatus, Detailed Error, Time, Memory, 28 | // SetupTime and RunningTime (in real clock) 29 | // 30 | // # Runner 31 | // 32 | // General interface to run a program, including a context 33 | // for cancellation 34 | package runner 35 | -------------------------------------------------------------------------------- /runner/limit.go: -------------------------------------------------------------------------------- 1 | package runner 2 | 3 | import ( 4 | "fmt" 5 | "time" 6 | ) 7 | 8 | // Limit represents the resource limit for traced process 9 | type Limit struct { 10 | TimeLimit time.Duration // user CPU time limit (in ns) 11 | MemoryLimit Size // user memory limit (in bytes) 12 | } 13 | 14 | func (l Limit) String() string { 15 | return fmt.Sprintf("Limit[Time=%v, Memory=%v]", l.TimeLimit, l.MemoryLimit) 16 | } 17 | -------------------------------------------------------------------------------- /runner/ptrace/filehandler/fileset.go: -------------------------------------------------------------------------------- 1 | package filehandler 2 | 3 | import ( 4 | "path/filepath" 5 | 
// ---- runner/ptrace/filehandler/fileset.go ----

// FileSet stores file permissions as a hierarchical set of path strings.
//
// Set holds the literal entries. IsInSetSmart gives special meaning to
// entries ending in "/" (the directory and everything below it) and "/*"
// (direct children only). SystemRoot records that the bare root "/" was
// added; it is kept out of Set.
type FileSet struct {
	Set        map[string]bool
	SystemRoot bool
}

// FilePerm stores the permission applied to a file
type FilePerm int

// FilePermWrite / Read / Stat are permissions
// NOTE(review): these constants are untyped integers (1, 2, 3), not FilePerm
// values; they convert implicitly when compared with a FilePerm.
const (
	FilePermWrite = iota + 1
	FilePermRead
	FilePermStat
)

// NewFileSet creates an empty file set with SystemRoot unset
func NewFileSet() FileSet {
	return FileSet{make(map[string]bool), false}
}

// IsInSetSmart reports whether name is covered by the set (the original
// comment says the logic is the same as in uoj-judger).
//
// name matches when any of the following holds:
//   - name itself is an entry;
//   - name is "/" and SystemRoot is set;
//   - name, or any ancestor obtained by repeatedly stripping the last
//     "/component", has an entry with a trailing "/";
//   - the direct parent of name has an entry with a trailing "/*";
//   - the entries "/" or (for a single-component path) "/*" exist.
func (s *FileSet) IsInSetSmart(name string) bool {
	if s.Set[name] {
		return true
	}
	if name == "/" && s.SystemRoot {
		return true
	}
	// Walk up the path; level counts how many components have been stripped.
	level := 0
	for level = 0; name != ""; level++ {
		// level == 1 means name is now the direct parent of the original path
		if level == 1 && s.Set[name+"/*"] {
			return true
		}
		if s.Set[name+"/"] {
			return true
		}
		name = dirname(name)
	}
	// the loop ran exactly once: the original path was a direct child of root
	if level == 1 && s.Set["/*"] {
		return true
	}
	if s.Set["/"] {
		return true
	}
	return false
}

// Add adds a single file path into the FileSet. The root path "/" sets
// SystemRoot instead of creating a Set entry.
func (s *FileSet) Add(name string) {
	if name == "/" {
		s.SystemRoot = true
	} else {
		s.Set[name] = true
	}
}

// AddRange adds multiple files into the FileSet.
// Absolute paths are added as given ("/" sets SystemRoot). Relative paths are
// joined onto workPath and stored with a trailing "/".
func (s *FileSet) AddRange(names []string, workPath string) {
	for _, n := range names {
		if filepath.IsAbs(n) {
			if n == "/" {
				s.SystemRoot = true
			} else {
				s.Set[n] = true
			}
		} else {
			s.Set[filepath.Join(workPath, n)+"/"] = true
		}
	}
}

// FileSets aggregates multiple permissions including write / read / stat / soft ban
type FileSets struct {
	Writable, Readable, Statable, SoftBan FileSet
}

// NewFileSets creates a new FileSets struct with four empty sets
func NewFileSets() *FileSets {
	return &FileSets{NewFileSet(), NewFileSet(), NewFileSet(), NewFileSet()}
}

// IsWritableFile determines whether the file path, or its symlink-resolved
// form, is inside the write set
func (s *FileSets) IsWritableFile(name string) bool {
	return s.Writable.IsInSetSmart(name) || s.Writable.IsInSetSmart(realPath(name))
}

// IsReadableFile determines whether the file path, or its symlink-resolved
// form, is inside the read / write set
func (s *FileSets) IsReadableFile(name string) bool {
	return s.IsWritableFile(name) || s.Readable.IsInSetSmart(name) || s.Readable.IsInSetSmart(realPath(name))
}

// IsStatableFile determines whether the file path, or its symlink-resolved
// form, is inside the stat / read / write set
func (s *FileSets) IsStatableFile(name string) bool {
	return s.IsReadableFile(name) || s.Statable.IsInSetSmart(name) || s.Statable.IsInSetSmart(realPath(name))
}

// IsSoftBanFile determines whether the file path, or its symlink-resolved
// form, is inside the softban set
func (s *FileSets) IsSoftBanFile(name string) bool {
	return s.SoftBan.IsInSetSmart(name) || s.SoftBan.IsInSetSmart(realPath(name))
}

// AddFilePermission adds the file into the set selected by mode and then adds
// every ancestor directory of name to the stat set. A mode other than
// write / read / stat adds only the ancestors.
func (s *FileSets) AddFilePermission(name string, mode FilePerm) {
	if mode == FilePermWrite {
		s.Writable.Add(name)
	} else if mode == FilePermRead {
		s.Readable.Add(name)
	} else if mode == FilePermStat {
		s.Statable.Add(name)
	}
	for name = dirname(name); name != ""; name = dirname(name) {
		s.Statable.Add(name)
	}
}

// GetExtraSet returns raw unchanged, followed by the symlink-resolved form of
// every entry in extra (entries that fail to resolve contribute "").
func GetExtraSet(extra, raw []string) []string {
	rt := make([]string, 0, len(extra)+len(raw))
	rt = append(rt, raw...)
	for _, v := range extra {
		rt = append(rt, realPath(v))
	}
	return rt
}

// dirname returns path truncated before its last "/", or "" when path
// contains no "/". Note that dirname("/x") is "".
func dirname(path string) string {
	if p := strings.LastIndex(path, "/"); p >= 0 {
		return path[:p]
	}
	return ""
}

// realPath resolves symlinks in p with filepath.EvalSymlinks and returns ""
// when resolution fails.
func realPath(p string) string {
	f, err := filepath.EvalSymlinks(p)
	if err != nil {
		return ""
	}
	return f
}

// ---- runner/ptrace/filehandler/fileset_test.go ----

// TestFileSet_IsInSetSmart checks exact, directory, wildcard and root
// matches, plus a path that is covered by none of the entries.
func TestFileSet_IsInSetSmart(t *testing.T) {
	// Create a new FileSet
	fs := NewFileSet()

	// Add paths to the FileSet
	fs.Add("/path/to/file")
	fs.Add("/path/to/dir/")
	fs.Add("/path/to/dir/*")
	fs.Add("/")

	// Test cases
	tests := []struct {
		name     string
		input    string
		expected bool
	}{
		{"Exact match", "/path/to/file", true},
		{"Directory match", "/path/to/dir", true},
		{"Wildcard match", "/path/to/dir/subfile", true},
		{"Root match", "/", true},
		{"Non-existent path", "/non/existent/path", false},
	}

	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			result := fs.IsInSetSmart(test.input)
			if result != test.expected {
				t.Errorf("IsInSetSmart(%q) = %v; expected %v", test.input, result, test.expected)
			}
		})
	}
}

// TestFileSet_Add checks that Add records ordinary paths in Set and that only
// the root path "/" flips SystemRoot.
func TestFileSet_Add(t *testing.T) {
	// Create a new FileSet
	fs := NewFileSet()

	if fs.SystemRoot {
		t.Errorf("NewFileSet() failed; expected SystemRoot to be false")
	}

	// Test adding a path that is not the root directory
	fs.Add("/path/to/file")
	if fs.SystemRoot {
		t.Errorf("Add(\"/path/to/file\") failed; expected SystemRoot to be false")
	}

	// Test adding the root directory
	fs.Add("/")
	if !fs.SystemRoot {
		t.Errorf("Add(\"/\") failed; expected SystemRoot to be true")
	}

	// Test adding a regular path
	fs.Add("/path/to/file")
	if !fs.Set["/path/to/file"] {
		t.Errorf("Add(\"/path/to/file\") failed; expected path to be in the set")
	}

	// Test adding another path
	fs.Add("/another/path")
	if !fs.Set["/another/path"] {
		t.Errorf("Add(\"/another/path\") failed; expected path to be in the set")
	}

	// Test adding a path with a trailing slash
	fs.Add("/path/to/dir/")
	if !fs.Set["/path/to/dir/"] {
		t.Errorf("Add(\"/path/to/dir/\") failed; expected path to be in the set")
	}

	// Test adding a path with a wildcard
	fs.Add("/path/to/dir/*")
	if !fs.Set["/path/to/dir/*"] {
		t.Errorf("Add(\"/path/to/dir/*\") failed; expected path to be in the set")
	}

	// Test adding a relative path
	fs.Add("relative/path")
	if !fs.Set["relative/path"] {
		t.Errorf("Add(\"relative/path\") failed; expected path to be in the set")
	}
}

// TestFileSet_AddRange checks how AddRange stores absolute paths, the root
// path and relative paths (joined onto workPath with a trailing "/").
func TestFileSet_AddRange(t *testing.T) {
	// Create a new FileSet
	fs := NewFileSet()

	// Test cases
	tests := []struct {
		name       string
		paths      []string
		workPath   string
		expected   map[string]bool
		systemRoot bool
	}{
		{
			name:     "Add absolute paths",
			paths:    []string{"/path/to/file", "/another/path"},
			workPath: "/work/dir",
			expected: map[string]bool{
				"/path/to/file": true,
				"/another/path": true,
			},
			systemRoot: false,
		},
		{
			name:       "Add root directory",
			paths:      []string{"/"},
			workPath:   "/work/dir",
			expected:   map[string]bool{},
			systemRoot: true,
		},
		{
			name:     "Add relative paths",
			paths:    []string{"relative/path", "another/relative/path"},
			workPath: "/work/dir",
			expected: map[string]bool{
				"/work/dir/relative/path/":         true,
				"/work/dir/another/relative/path/": true,
			},
			systemRoot: false,
		},
	}

	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			// Reset the FileSet
			fs = NewFileSet()

			// Call AddRange
			fs.AddRange(test.paths, test.workPath)

			// Check SystemRoot
			if fs.SystemRoot != test.systemRoot {
				t.Errorf("SystemRoot = %v; expected %v", fs.SystemRoot, test.systemRoot)
			}

			// Check the Set
			if !maps.Equal(fs.Set, test.expected) {
				t.Errorf("Set = %v; expected %v", fs.Set, test.expected)
			}
		})
	}
}

// ---- runner/ptrace/filehandler/handle.go ----

// Handler defines a file-access-restricting handler used by the ptrace safe
// runner. FileSet holds the path permissions and SyscallCounter the
// per-syscall budgets.
type Handler struct {
	FileSet        *FileSets
	SyscallCounter SyscallCounter
}

// CheckRead allows the access when fn is readable; otherwise the result is
// decided by onDgsFileDetect (ban or kill).
func (h *Handler) CheckRead(fn string) ptracer.TraceAction {
	if !h.FileSet.IsReadableFile(fn) {
		return h.onDgsFileDetect(fn)
	}
	return ptracer.TraceAllow
}

// CheckWrite allows the access when fn is writable; otherwise the result is
// decided by onDgsFileDetect (ban or kill).
func (h *Handler) CheckWrite(fn string) ptracer.TraceAction {
	if !h.FileSet.IsWritableFile(fn) {
		return h.onDgsFileDetect(fn)
	}
	return ptracer.TraceAllow
}

// CheckStat allows the access when fn is statable; otherwise the result is
// decided by onDgsFileDetect (ban or kill).
func (h *Handler) CheckStat(fn string) ptracer.TraceAction {
	if !h.FileSet.IsStatableFile(fn) {
		return h.onDgsFileDetect(fn)
	}
	return ptracer.TraceAllow
}

// CheckSyscall checks syscalls other than allowed and traced against the
// SyscallCounter. A counted syscall is allowed while its budget lasts and
// killed once it is exhausted; an uncounted syscall is soft banned.
func (h *Handler) CheckSyscall(syscallName string) ptracer.TraceAction {
	// if it is traced, then try to count syscall
	if inside, allow := h.SyscallCounter.Check(syscallName); inside {
		if allow {
			return ptracer.TraceAllow
		}
		return ptracer.TraceKill
	}
	// if it is traced but not counted, it should be soft banned
	return ptracer.TraceBan
}

// onDgsFileDetect handles a disallowed file access: it soft bans the syscall
// when the file is in the soft ban set and kills the traced process otherwise.
func (h *Handler) onDgsFileDetect(name string) ptracer.TraceAction {
	if h.FileSet.IsSoftBanFile(name) {
		return ptracer.TraceBan
	}
	return ptracer.TraceKill
}

// ---- runner/ptrace/filehandler/syscallcounter.go ----

// SyscallCounter defines a count-down for each occurrence of a syscall,
// keyed by syscall name
type SyscallCounter map[string]int

// NewSyscallCounter creates a new, empty SyscallCounter
func NewSyscallCounter() SyscallCounter {
	return SyscallCounter(make(map[string]int))
}

// Add sets the counter for a single syscall, replacing any existing value
func (s SyscallCounter) Add(name string, count int) {
	s[name] = count
}

// AddRange sets multiple counters, replacing any existing values
func (s SyscallCounter) AddRange(m map[string]int) {
	for k, v := range m {
		s[k] = v
	}
}

// Check returns (inside, allow). inside reports whether name has a counter.
// When it does, the counter is decremented and allow is false once the value
// before the decrement was 1 or less. When it does not, the result is
// (false, true).
// NOTE(review): a counter added with count 1 therefore denies the very first
// call; confirm that a budget of n is meant to permit n-1 calls.
func (s SyscallCounter) Check(name string) (bool, bool) {
	n, o := s[name]
	if o {
		s[name] = n - 1
		if n <= 1 {
			return true, false
		}
		return true, true
	}
	return false, true
}
// ---- runner/ptrace/handle_linux.go ----

// tracerHandler adapts a path-based Handler to the ptracer handler interface:
// it decodes syscall arguments from the ptracer.Context and forwards the
// resolved file names to Handler.
type tracerHandler struct {
	// ShowDetails enables Debug output. Unsafe downgrades a kill verdict from
	// CheckSyscall to a soft ban (see Handle).
	ShowDetails, Unsafe bool
	Handler             Handler
}

// Debug prints v to stderr when ShowDetails is set and does nothing otherwise.
func (h *tracerHandler) Debug(v ...interface{}) {
	if h.ShowDetails {
		fmt.Fprintln(os.Stderr, v...)
	}
}

// getString reads the string at addr from the traced process and converts it
// to an absolute path relative to that process's working directory.
func (h *tracerHandler) getString(ctx *ptracer.Context, addr uint) string {
	return absPath(ctx.Pid, ctx.GetString(uintptr(addr)))
}

// checkOpen checks an open-style syscall whose path is at addr. The access is
// treated as a read only when the access mode is O_RDONLY and none of
// O_CREAT, O_EXCL or O_TRUNC is set; anything else is checked as a write.
func (h *tracerHandler) checkOpen(ctx *ptracer.Context, addr uint, flags uint) ptracer.TraceAction {
	fn := h.getString(ctx, addr)
	isReadOnly := (flags&syscall.O_ACCMODE == syscall.O_RDONLY) &&
		(flags&syscall.O_CREAT == 0) &&
		(flags&syscall.O_EXCL == 0) &&
		(flags&syscall.O_TRUNC == 0)

	h.Debug("open: ", fn, getFileMode(flags))
	if isReadOnly {
		return h.Handler.CheckRead(fn)
	}
	return h.Handler.CheckWrite(fn)
}

// checkRead checks read permission for the path stored at addr.
func (h *tracerHandler) checkRead(ctx *ptracer.Context, addr uint) ptracer.TraceAction {
	fn := h.getString(ctx, addr)
	h.Debug("check read: ", fn)
	return h.Handler.CheckRead(fn)
}

// checkWrite checks write permission for the path stored at addr.
func (h *tracerHandler) checkWrite(ctx *ptracer.Context, addr uint) ptracer.TraceAction {
	fn := h.getString(ctx, addr)
	h.Debug("check write: ", fn)
	return h.Handler.CheckWrite(fn)
}

// checkStat checks stat permission for the path stored at addr.
func (h *tracerHandler) checkStat(ctx *ptracer.Context, addr uint) ptracer.TraceAction {
	fn := h.getString(ctx, addr)
	h.Debug("check stat: ", fn)
	return h.Handler.CheckStat(fn)
}

// Handle decides what to do with the syscall the traced process is about to
// make.
//
// The syscall number is translated to a name through libseccomp; an unknown
// number kills the process. File-related syscalls are checked against the
// path argument at the index given in the switch below. Every other syscall
// goes to Handler.CheckSyscall, and in Unsafe mode a kill verdict from that
// call is downgraded to a ban. A ban verdict sets the syscall's return value
// via softBanSyscall.
func (h *tracerHandler) Handle(ctx *ptracer.Context) ptracer.TraceAction {
	syscallNo := ctx.SyscallNo()
	syscallName, err := libseccomp.ToSyscallName(syscallNo)
	h.Debug("syscall:", syscallNo, syscallName, err)
	if err != nil {
		h.Debug("invalid syscall no")
		return ptracer.TraceKill
	}

	// NOTE(review): for the *at variants below the dirfd argument (arg0) is
	// not consulted; relative paths are resolved against the process cwd by
	// absPath. Confirm this is acceptable.
	action := ptracer.TraceKill
	switch syscallName {
	case "open":
		action = h.checkOpen(ctx, ctx.Arg0(), ctx.Arg1())
	case "openat":
		action = h.checkOpen(ctx, ctx.Arg1(), ctx.Arg2())

	case "readlink":
		action = h.checkRead(ctx, ctx.Arg0())
	case "readlinkat":
		action = h.checkRead(ctx, ctx.Arg1())

	case "unlink":
		action = h.checkWrite(ctx, ctx.Arg0())
	case "unlinkat":
		action = h.checkWrite(ctx, ctx.Arg1())

	case "access":
		action = h.checkStat(ctx, ctx.Arg0())
	case "faccessat", "newfstatat":
		action = h.checkStat(ctx, ctx.Arg1())

	case "stat", "stat64":
		action = h.checkStat(ctx, ctx.Arg0())
	case "lstat", "lstat64":
		action = h.checkStat(ctx, ctx.Arg0())

	case "execve":
		action = h.checkRead(ctx, ctx.Arg0())
	case "execveat":
		action = h.checkRead(ctx, ctx.Arg1())

	case "chmod":
		action = h.checkWrite(ctx, ctx.Arg0())
	case "rename":
		// NOTE(review): only the source path (arg0) is checked here; the
		// destination path (arg1) is not. Confirm this is intended.
		action = h.checkWrite(ctx, ctx.Arg0())

	default:
		action = h.Handler.CheckSyscall(syscallName)
		if h.Unsafe && action == ptracer.TraceKill {
			action = ptracer.TraceBan
		}
	}

	switch action {
	case ptracer.TraceAllow:
		return ptracer.TraceAllow
	case ptracer.TraceBan:
		h.Debug("")
		return softBanSyscall(ctx)
	default:
		return ptracer.TraceKill
	}
}

// softBanSyscall sets the syscall's return value to -BanRet and reports
// TraceBan.
func softBanSyscall(ctx *ptracer.Context) ptracer.TraceAction {
	ctx.SetReturnValue(-int(BanRet))
	return ptracer.TraceBan
}

// getFileMode renders the access mode bits of flags as a two-character tag
// for debug output: "r " (read only), "w " (write only), "wr" (read/write) or
// "??" for any other value.
func getFileMode(flags uint) string {
	switch flags & syscall.O_ACCMODE {
	case syscall.O_RDONLY:
		return "r "
	case syscall.O_WRONLY:
		return "w "
	case syscall.O_RDWR:
		return "wr"
	default:
		return "??"
	}
}

// getProcCwd returns the working directory of process pid by reading the
// /proc/<pid>/cwd link (/proc/self/cwd when pid <= 0). It returns "" when the
// link cannot be read.
func getProcCwd(pid int) string {
	fileName := "/proc/self/cwd"
	if pid > 0 {
		fileName = fmt.Sprintf("/proc/%d/cwd", pid)
	}
	s, err := os.Readlink(fileName)
	if err != nil {
		return ""
	}
	return s
}

// absPath calculates the absolute path of p as seen by process pid.
// A relative p is joined onto the process's working directory; an absolute p
// is only cleaned. If the working directory cannot be read (getProcCwd
// returns ""), the result for a relative p is the cleaned relative path.
func absPath(pid int, p string) string {
	// if relative path
	if !filepath.IsAbs(p) {
		return filepath.Join(getProcCwd(pid), p)
	}
	return filepath.Clean(p)
}

// ---- runner/ptrace/run_linux.go ----

// Run starts the tracing process.
//
// It builds a forkexec.Runner from the Runner's fields with Ptrace enabled
// and the seccomp filter taken from r.Seccomp.SockFprog(), wraps r.Handler in
// a tracerHandler, and returns the result of ptracer.Tracer.Trace for the
// given context. UnshareCgroupAfterSync is requested only when the current
// uid is 0.
func (r *Runner) Run(c context.Context) runner.Result {
	ch := &forkexec.Runner{
		Args:     r.Args,
		Env:      r.Env,
		ExecFile: r.ExecFile,
		RLimits:  r.RLimits,
		Files:    r.Files,
		WorkDir:  r.WorkDir,
		Seccomp:  r.Seccomp.SockFprog(),
		Ptrace:   true,
		SyncFunc: r.SyncFunc,

		UnshareCgroupAfterSync: os.Getuid() == 0,
	}

	th := &tracerHandler{
		ShowDetails: r.ShowDetails,
		Unsafe:      r.Unsafe,
		Handler:     r.Handler,
	}

	tracer := ptracer.Tracer{
		Handler: th,
		Runner:  ch,
		Limit:   r.Limit,
	}
	return tracer.Trace(c)
}
// ---- runner/ptrace/runner_linux.go ----

// Runner defines the spec to run a program safely by ptracer
type Runner struct {
	// argv and env for the child process
	// work path set by setcwd (current working directory for child)
	Args    []string
	Env     []string
	WorkDir string

	// file descriptor of the program to execute (fexecve)
	ExecFile uintptr

	// file descriptors for new process, from 0 to len - 1
	Files []uintptr

	// Resource limits applied with setrlimit
	RLimits []rlimit.RLimit

	// Resource limit enforced by the tracer
	Limit runner.Limit

	// Defines seccomp filter for the ptrace runner
	// file access syscalls need to be set as ActionTrace
	// allowed syscalls need to be set as ActionAllow
	// default action should be ActionTrace / ActionKill
	Seccomp seccomp.Filter

	// Traced syscall handler
	Handler Handler

	// ShowDetails / Unsafe debug flags
	ShowDetails, Unsafe bool

	// Used by cgroup to add the process
	SyncFunc func(pid int) error
}

// BanRet defines the errno reported to the traced program for a syscall ban
// action (the syscall's return value is set to -BanRet).
var BanRet = syscall.EACCES

// Handler defines the action taken when a file access or another traced
// syscall is encountered. The string argument is the file path for the
// CheckRead / CheckWrite / CheckStat methods and the syscall name for
// CheckSyscall.
type Handler interface {
	CheckRead(string) ptracer.TraceAction
	CheckWrite(string) ptracer.TraceAction
	CheckStat(string) ptracer.TraceAction
	CheckSyscall(string) ptracer.TraceAction
}
type Result struct { 10 | Status // result status 11 | ExitStatus int // exit status (signal number if signalled) 12 | Error string // potential detailed error message (for program runner error) 13 | 14 | Time time.Duration // used user CPU time (underlying type int64 in ns) 15 | Memory Size // used user memory (underlying type uint64 in bytes) 16 | ProcPeak uint64 // maximum processes 17 | 18 | // metrics for the program runner 19 | SetUpTime time.Duration 20 | RunningTime time.Duration 21 | } 22 | 23 | func (r Result) String() string { 24 | switch r.Status { 25 | case StatusNormal: 26 | return fmt.Sprintf("Result[%v %v][%v %v]", r.Time, r.Memory, r.SetUpTime, r.RunningTime) 27 | 28 | case StatusSignalled: 29 | return fmt.Sprintf("Result[Signalled(%d)][%v %v][%v %v]", r.ExitStatus, r.Time, r.Memory, r.SetUpTime, r.RunningTime) 30 | 31 | case StatusRunnerError: 32 | return fmt.Sprintf("Result[RunnerFailed(%s)][%v %v][%v %v]", r.Error, r.Time, r.Memory, r.SetUpTime, r.RunningTime) 33 | 34 | default: 35 | return fmt.Sprintf("Result[%v(%s %d)][%v %v][%v %v]", r.Status, r.Error, r.ExitStatus, r.Time, r.Memory, r.SetUpTime, r.RunningTime) 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /runner/runner.go: -------------------------------------------------------------------------------- 1 | package runner 2 | 3 | import ( 4 | "context" 5 | ) 6 | 7 | // Runner interface defines method to start running 8 | type Runner interface { 9 | Run(context.Context) Result 10 | } 11 | -------------------------------------------------------------------------------- /runner/size.go: -------------------------------------------------------------------------------- 1 | package runner 2 | 3 | import ( 4 | "fmt" 5 | "strconv" 6 | ) 7 | 8 | // Size stores number of byte for the object. E.g. Memory. 
// ---- runner/size.go ----

// Size stores a number of bytes (for example a memory amount).
// The maximum size is bounded by the 64-bit limit of the underlying uint64.
type Size uint64

// String implements fmt.Stringer. Values below 1 KiB are printed as an exact
// byte count; larger values are printed with one decimal in KiB, MiB or GiB
// (GiB is the largest unit used).
func (s Size) String() string {
	t := uint64(s)
	switch {
	case t < 1<<10:
		return fmt.Sprintf("%d B", t)
	case t < 1<<20:
		return fmt.Sprintf("%.1f KiB", float64(t)/float64(1<<10))
	case t < 1<<30:
		return fmt.Sprintf("%.1f MiB", float64(t)/float64(1<<20))
	default:
		return fmt.Sprintf("%.1f GiB", float64(t)/float64(1<<30))
	}
}

// Set parses the size value from str and stores it in s.
//
// The accepted form is a decimal integer, optionally followed by a unit
// letter k / m / g (case-insensitive, powers of 1024), optionally followed by
// b or B. Examples: "4096", "16b", "64k", "256MB", "1G".
//
// An error is returned, and s is left unchanged, when str holds no digits
// (including the empty string and a bare suffix such as "b"), is not a valid
// integer, is negative, or does not fit in 64 bits once the unit is applied.
func (s *Size) Set(str string) error {
	digits := str

	// optional trailing byte marker
	if n := len(digits); n > 0 && (digits[n-1] == 'b' || digits[n-1] == 'B') {
		digits = digits[:n-1]
	}

	// optional unit letter, expressed as a left shift
	var shift uint
	if n := len(digits); n > 0 {
		switch digits[n-1] {
		case 'k', 'K':
			shift = 10
		case 'm', 'M':
			shift = 20
		case 'g', 'G':
			shift = 30
		}
		if shift != 0 {
			digits = digits[:n-1]
		}
	}

	// ParseInt rejects an empty digit string, so no input can index past the
	// end of the string as the previous implementation did.
	v, err := strconv.ParseInt(digits, 10, 64)
	if err != nil {
		return err
	}
	if v < 0 {
		return fmt.Errorf("invalid size %q: negative value", str)
	}
	// refuse values whose scaled form would silently wrap around
	if uint64(v) > ^uint64(0)>>shift {
		return fmt.Errorf("invalid size %q: value out of range", str)
	}
	*s = Size(uint64(v) << shift)
	return nil
}

// Byte return size in bytes
func (s Size) Byte() uint64 {
	return uint64(s)
}

// KiB return size in KiB (rounded down)
func (s Size) KiB() uint64 {
	return uint64(s) >> 10
}

// MiB return size in MiB (rounded down)
func (s Size) MiB() uint64 {
	return uint64(s) >> 20
}

// GiB return size in GiB (rounded down)
func (s Size) GiB() uint64 {
	return uint64(s) >> 30
}

// TiB return size in TiB (rounded down)
func (s Size) TiB() uint64 {
	return uint64(s) >> 40
}

// PiB return size in PiB (rounded down)
func (s Size) PiB() uint64 {
	return uint64(s) >> 50
}

// EiB return size in EiB (rounded down)
func (s Size) EiB() uint64 {
	return uint64(s) >> 60
}
package runner

// Status is the result Status
type Status int

// Result Status for program runner.
// The numeric values double as indexes into statusString, so the two
// lists must be kept in the same order.
const (
	StatusInvalid Status = iota // 0 not initialized
	// Normal
	StatusNormal // 1 normal

	// Resource Limit Exceeded
	StatusTimeLimitExceeded   // 2 tle
	StatusMemoryLimitExceeded // 3 mle
	StatusOutputLimitExceeded // 4 ole

	// Unauthorized Access
	StatusDisallowedSyscall // 5 ban

	// Runtime Error
	StatusSignalled         // 6 signalled
	StatusNonzeroExitStatus // 7 nonzero exit status

	// Program Runner Error
	StatusRunnerError // 8 runner error
)

var (
	// statusString maps a Status value (used as index) to its display text.
	// StatusNormal intentionally maps to the empty string.
	statusString = []string{
		"Invalid",
		"",
		"Time Limit Exceeded",
		"Memory Limit Exceeded",
		"Output Limit Exceeded",
		"Disallowed Syscall",
		"Signalled",
		"Nonzero Exit Status",
		"Runner Error",
	}
)

// String returns the display text for t. Values outside the known range
// (negative or beyond the table) are reported as "Invalid".
func (t Status) String() string {
	i := int(t)
	if i >= 0 && i < len(statusString) {
		return statusString[i]
	}
	return statusString[0]
}

// Error returns the same text as String, which makes Status satisfy the
// error interface.
func (t Status) Error() string {
	return t.String()
}

// --------------------------------------------------------------------------------
// /runner/unshare/doc.go:
// --------------------------------------------------------------------------------
// Package unshare implements runner that uses Linux unshare syscall & mount namespace & rlimit
// to restrict program access
package unshare

// --------------------------------------------------------------------------------
// /runner/unshare/run_linux.go:
// --------------------------------------------------------------------------------
package unshare

import (
	"context"
	"fmt"
	"os"
	"time"

	"golang.org/x/sys/unix"

	"github.com/criyle/go-sandbox/pkg/forkexec"
	"github.com/criyle/go-sandbox/runner"
)

const (
	// UnshareFlags is the set of clone flags used to create new namespaces
	// (mount, PID, user, UTS and cgroup); NET and IPC are not included.
	UnshareFlags = unix.CLONE_NEWNS | unix.CLONE_NEWPID | unix.CLONE_NEWUSER | unix.CLONE_NEWUTS | unix.CLONE_NEWCGROUP
)

// Run starts the unshared process, waits for it with wait4 and converts the
// wait status and resource usage into a runner.Result.
//
// Cancelling c sends SIGKILL to the process group (see killAll). On every
// return path after a successful start, the process group is killed and
// reaped, and SetUpTime / RunningTime are filled in by a deferred function.
// If starting fails, a StatusRunnerError result is returned and the timing
// fields stay zero.
func (r *Runner) Run(c context.Context) (result runner.Result) {
	// Translate the Runner configuration into the forkexec parameters.
	ch := &forkexec.Runner{
		Args:       r.Args,
		Env:        r.Env,
		ExecFile:   r.ExecFile,
		RLimits:    r.RLimits,
		Files:      r.Files,
		WorkDir:    r.WorkDir,
		Seccomp:    r.Seccomp.SockFprog(),
		NoNewPrivs: true,
		CloneFlags: UnshareFlags,
		Mounts:     r.Mounts,
		HostName:   r.HostName,
		DomainName: r.DomainName,
		PivotRoot:  r.Root,
		DropCaps:   true,
		SyncFunc:   r.SyncFunc,

		UnshareCgroupAfterSync: true,
	}

	var (
		wstatus unix.WaitStatus // wait4 wait status
		rusage  unix.Rusage     // wait4 rusage
		status  = runner.StatusNormal
		sTime   = time.Now() // start time
		fTime   time.Time    // finish time for setup
	)

	// Start the runner
	pgid, err := ch.Start()
	r.println("Starts: ", pgid, err)
	if err != nil {
		result.Status = runner.StatusRunnerError
		result.Error = err.Error()
		return
	}

	// Derived context: cancelled either by the caller or by the deferred
	// cancel below, so the watcher goroutine always terminates.
	ctx, cancel := context.WithCancel(c)
	defer cancel()

	// handle cancel
	// NOTE(review): deferred calls run last-in-first-out, so on return the
	// kill/reap defer below runs first and cancel() afterwards; the goroutine
	// then calls killAll(pgid) once more after the group has been reaped.
	// That looks harmless unless the id has been recycled - confirm.
	go func() {
		<-ctx.Done()
		killAll(pgid)
	}()

	// kill all tracee upon return
	// The closure also writes the timing metrics into the named result, which
	// is why it must run after result has been assigned in the loop below.
	defer func() {
		killAll(pgid)
		collectZombie(pgid)
		result.SetUpTime = fTime.Sub(sTime)
		result.RunningTime = time.Since(fTime)
	}()

	fTime = time.Now()
	for {
		_, err := unix.Wait4(pgid, &wstatus, 0, &rusage)
		if err == unix.EINTR {
			// interrupted by a signal: retry the wait
			continue
		}
		r.println("wait4: ", wstatus)
		if err != nil {
			result.Status = runner.StatusRunnerError
			result.Error = err.Error()
			return
		}

		// update resource usage and check against limits
		userTime := time.Duration(rusage.Utime.Nano()) // ns
		// Maxrss is shifted left by 10, i.e. treated as KiB (presumably as
		// reported by Linux getrusage) and converted to bytes.
		userMem := runner.Size(rusage.Maxrss << 10) // bytes

		// check tle / mle
		// The memory check comes second, so MLE wins when both limits are exceeded.
		if userTime > r.Limit.TimeLimit {
			status = runner.StatusTimeLimitExceeded
		}
		if userMem > r.Limit.MemoryLimit {
			status = runner.StatusMemoryLimitExceeded
		}
		result = runner.Result{
			Status: status,
			Time:   userTime,
			Memory: userMem,
		}
		// A limit violation is reported as-is; ExitStatus is left at zero.
		if status != runner.StatusNormal {
			return
		}

		switch {
		case wstatus.Exited():
			result.Status = runner.StatusNormal
			result.ExitStatus = wstatus.ExitStatus()
			if result.ExitStatus != 0 {
				result.Status = runner.StatusNonzeroExitStatus
			}
			return

		case wstatus.Signaled():
			// Map the terminating signal to a status. SIGKILL is reported as
			// TLE, which includes kills issued by killAll on cancellation.
			sig := wstatus.Signal()
			switch sig {
			case unix.SIGXCPU, unix.SIGKILL:
				status = runner.StatusTimeLimitExceeded
			case unix.SIGXFSZ:
				status = runner.StatusOutputLimitExceeded
			case unix.SIGSYS:
				status = runner.StatusDisallowedSyscall
			default:
				status = runner.StatusSignalled
			}
			result.Status = status
			result.ExitStatus = int(sig)
			return
		}
		// Any other wait status (neither exited nor signalled): wait again.
	}
}

// killAll sends SIGKILL to the process group pgid (negative pid argument).
// The error returned by Kill is ignored.
func killAll(pgid int) {
	unix.Kill(-pgid, unix.SIGKILL)
}

// collectZombie reaps children in process group pgid without blocking
// (WNOHANG). It keeps calling Wait4 until an error other than EINTR is
// returned.
// NOTE(review): a call that returns no error (including "nothing exited
// yet") loops again immediately, so this presumably spins until the killed
// children have all been reaped - confirm that is intended.
func collectZombie(pgid int) {
	var wstatus unix.WaitStatus
	for {
		if _, err := unix.Wait4(-pgid, &wstatus, unix.WALL|unix.WNOHANG, nil); err != unix.EINTR && err != nil {
			break
		}
	}
}

// println writes v to standard error, but only when ShowDetails is set.
func (r *Runner) println(v ...interface{}) {
	if r.ShowDetails {
		fmt.Fprintln(os.Stderr, v...)
	}
}

// --------------------------------------------------------------------------------
// /runner/unshare/runner_linux.go:
// --------------------------------------------------------------------------------
package unshare

import (
	"github.com/criyle/go-sandbox/pkg/mount"
	"github.com/criyle/go-sandbox/pkg/rlimit"
	"github.com/criyle/go-sandbox/pkg/seccomp"
	"github.com/criyle/go-sandbox/runner"
)

// Runner runs program in unshared namespaces.
// Its fields are copied into a forkexec.Runner by Run; Limit and ShowDetails
// are used by Run itself.
type Runner struct {
	// argv and env for the child process
	Args []string
	Env  []string

	// fexecve param (presumably a file descriptor of the executable; verify
	// against forkexec.Runner.ExecFile)
	ExecFile uintptr

	// workdir is the current dir after unshare mount namespaces
	WorkDir string

	// file descriptors for new process, from 0 to len - 1
	Files []uintptr

	// Resource limit set by set rlimit
	RLimits []rlimit.RLimit

	// Resource limit checked by Run against the rusage returned by wait4
	// (TimeLimit against user CPU time, MemoryLimit against max RSS)
	Limit runner.Limit

	// Seccomp defines the seccomp filter attach to the process (should be whitelist only)
	Seccomp seccomp.Filter

	// New root (passed to forkexec as PivotRoot)
	Root string

	// Mount syscalls
	Mounts []mount.SyscallParams

	// hostname & domainname
	HostName, DomainName string

	// ShowDetails enables diagnostic output on standard error
	ShowDetails bool

	// SyncFunc is forwarded to forkexec; used by cgroup to add the process
	SyncFunc func(pid int) error
}

// --------------------------------------------------------------------------------