├── .gitattributes ├── .gitignore ├── .gitmodules ├── LICENSE ├── Makefile ├── README.md ├── build.zig ├── labo ├── README.md ├── basic │ ├── README.md │ ├── clone.zig │ ├── create_new_file.zig │ ├── forkexecv.zig │ ├── hello.zig │ ├── namespace.zig │ ├── randstr.zig │ ├── return_from_switch.zig │ ├── simpleforkunshareexecv.zig │ ├── structfieldsiterate.zig │ ├── tmpfile.zig │ ├── userns_child_exec │ └── userns_child_exec.c ├── mount_procfs │ ├── README.md │ └── strace.txt └── multipass │ ├── README.md │ └── launch.sh ├── prepare.sh ├── src ├── CgroupsManager.zig ├── OciSpec.zig ├── StateManager.zig ├── main.zig ├── syscall.zig └── util.zig └── testdata ├── sample_spec_linux.json └── state.json /.gitattributes: -------------------------------------------------------------------------------- 1 | * text=auto 2 | *.zig text eol=lf 3 | zigmod.* text eol=lf 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # This file is for zig-specific build artifacts. 2 | # If you have OS-specific or editor-specific files to ignore, 3 | # such as *.swp or .DS_Store, put those in your global 4 | # ~/.gitignore and put this in your ~/.gitconfig: 5 | # 6 | # [core] 7 | # excludesfile = ~/.gitignore 8 | # 9 | # Cheers! 
10 | # -andrewrk 11 | 12 | zig-cache/ 13 | zig-out/ 14 | /release/ 15 | /debug/ 16 | /build/ 17 | /build-*/ 18 | /docgen_tmp/ 19 | cloud-config.yaml 20 | .zigmod 21 | deps.zig 22 | 23 | # VSCode related files 24 | .vscode/* 25 | 26 | # Local History for Visual Studio Code 27 | .history/ 28 | 29 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "lib/zig-arg"] 2 | path = lib/zig-arg 3 | url = https://github.com/PrajwalCH/zig-arg 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Kotaro Inoue 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
# Convenience wrappers around `zig build`.
#
# None of these targets produces a file named after itself, so all of them are
# declared phony; otherwise a stray file or directory called `test` or `clean`
# in the repository root would make `make` consider the target up to date and
# skip its recipe.
.PHONY: build test clean

# Build the project and install the artifacts under ./zig-out.
build:
	zig build -p ./zig-out

# Run the unit tests declared in build.zig.
test:
	zig build test

# Remove everything that was installed under ./zig-out.
clean:
	rm -rf ./zig-out/*
/// Makes the vendored `zig-arg` package (a git submodule under `lib/`)
/// importable from `artifact`.
fn addZigArg(artifact: *std.build.LibExeObjStep) void {
    artifact.addPackagePath("zig-arg", "lib/zig-arg/src/lib.zig");
}

/// Declares the build graph: the `runzigc` executable (installed by default),
/// a `run` step that executes it, and a `test` step that runs the unit tests
/// rooted at `src/main.zig`.
pub fn build(b: *std.build.Builder) void {
    // Neither the target nor the build mode is restricted here: both are taken
    // from the `zig build` command line, defaulting to native / Debug.
    const target = b.standardTargetOptions(.{});
    const mode = b.standardReleaseOptions();

    // The runtime binary itself.
    const runzigc = b.addExecutable("runzigc", "src/main.zig");
    runzigc.setTarget(target);
    runzigc.setBuildMode(mode);
    addZigArg(runzigc);
    runzigc.install();

    // `zig build run -- <args>`: install first, then run with the forwarded
    // arguments (if any were given).
    const run_runzigc = runzigc.run();
    run_runzigc.step.dependOn(b.getInstallStep());
    if (b.args) |forwarded| {
        run_runzigc.addArgs(forwarded);
    }
    b.step("run", "Run the app").dependOn(&run_runzigc.step);

    // `zig build test`: same root file, same target/mode, same dependency.
    const unit_tests = b.addTest("src/main.zig");
    unit_tests.setTarget(target);
    unit_tests.setBuildMode(mode);
    addZigArg(unit_tests);
    b.step("test", "Run unit tests").dependOn(&unit_tests.step);
}
-------------------------------------------------------------------------------- 1 | ## PID namespace 2 | 3 | PID namespace creation by unshare https://qiita.com/Ewokkkkk/items/f2fc09d09584bcb135da 4 | 5 | ``` 6 | $ sudo unshare --pid --mount-proc --fork /bin/bash 7 | ``` 8 | 9 | - `--pid`: isolate PID namespace 10 | - `--mount-proc`: mount a fresh `/proc` for the new namespace; without this option, tools that read `/proc` (e.g. `ps`) still show the processes of the **parent** PID namespace. 11 | - `--fork`: fork the given program as a child of `unshare` instead of running it directly, so that it actually runs inside the new PID namespace 12 | 13 | ## UTS (Unix Time-Sharing) namespace (hostname isolation) 14 | 15 | btw what is UTS? 16 | 17 | > It means the process has a separate copy of the hostname and the (now mostly unused) NIS domain name, so it can set it to something else without affecting the rest of the system. 18 | >The hostname is set via sethostname and is the nodename member of the struct returned by uname. The NIS domain name is set by setdomainname and is the domainname member of the struct returned by uname. 19 | https://unix.stackexchange.com/questions/183717/whats-a-uts-namespace 20 | 21 | ``` 22 | $ sudo unshare -u /bin/bash 23 | ``` 24 | 25 | - `-u`: isolate UTS Namespace 26 | 27 | ## capability 28 | without any capability option 29 | 30 | ``` 31 | $ /home/mssn/.linuxbrew/bin/zig run namespace.zig 32 | parent pid: 678 33 | child pid: 745 34 | uid: 65534 35 | gid: 65534 36 | $ getpcap 745 37 | /bin/sh: 1: getpcap: not found 38 | $ getpcaps 745 39 | 745: = 40 | $ getpcaps 678 41 | 678: = 42 | $ cat /proc/678/status | grep Cap 43 | CapInh: 0000000000000000 44 | CapPrm: 0000000000000000 45 | CapEff: 0000000000000000 46 | CapBnd: 0000003fffffffff 47 | CapAmb: 0000000000000000 48 | $ cat /proc/745/status | grep Cap 49 | CapInh: 0000000000000000 50 | CapPrm: 0000000000000000 51 | CapEff: 0000000000000000 52 | CapBnd: 0000003fffffffff 53 | CapAmb: 0000000000000000 54 | $ capsh --decode=0000003fffffffff 55 | 
0x0000003fffffffff=cap_chown,cap_dac_override,cap_dac_read_search,cap_fowner,cap_fsetid,cap_kill,cap_setgid,cap_setuid,cap_setpcap,cap_linux_immutable,cap_net_bind_service,cap_net_broadcast,cap_net_admin,cap_net_raw,cap_ipc_lock,cap_ipc_owner,cap_sys_module,cap_sys_rawio,cap_sys_chroot,cap_sys_ptrace,cap_sys_pacct,cap_sys_admin,cap_sys_boot,cap_sys_nice,cap_sys_resource,cap_sys_time,cap_sys_tty_config,cap_mknod,cap_lease,cap_audit_write,cap_audit_control,cap_setfcap,cap_mac_override,cap_mac_admin,cap_syslog,cap_wake_alarm,cap_block_suspend,cap_audit_read 56 | ``` 57 | 58 | CapEffが0だからダメじゃんねこれ 59 | 60 | cap_setuid and cap_setgid exists by default. 61 | 62 | ## troubleshooting 63 | 64 | なぜかuid_mapにoperation not permittedとなって書き込めない。 65 | 66 | > gid_mapについても同様ですが、 Linux 3.19 からプロセスの補助グループを設定する setgroups の権限と、gid_mapの設定権限が排他となったため、 setgroups が有効な場合には、先にそちらを無効にする必要があります。 67 | プロセスに対する setgroups 権限は、同様に /proc ファイルシステム上のファイルを使い、確認/設定することができます。 68 | https://tech.retrieva.jp/entry/2019/06/04/130134 69 | 70 | マジ? 71 | 72 | ``` 73 | $ sudo /home/mssn/.linuxbrew/bin/zig run namespace.zig 74 | parent pid: 10086 75 | child pid: 10153 76 | uid: 65534 77 | gid: 65534 78 | uid_map_path: /proc/10153/uid_map 79 | gid_map_path: /proc/10153/gid_map 80 | $ echo deny > /proc/10153/setgroups 81 | /bin/sh: 1: cannot create /proc/10153/setgroups: Permission denied 82 | $ echo deny >> /proc/10153/setgroups 83 | /bin/sh: 2: cannot create /proc/10153/setgroups: Permission denied 84 | ``` 85 | 86 | あっ、そういうことかこれか 87 | 88 | > また、User名前空間の中のプロセスが自身のuid_map/gid_mapに書き込むことはできず、かならず親User名前空間のプロセスから書き込む必要があります。 89 | C++などで実装しているときには、 fork (2) 後に子プロセス、親プロセスそれぞれに制御が移るため実装は楽ですが、シェルスクリプトだと別のシェルを開いたり、バックグラウンドプロセスを作ってpidをやりとりする、などの工夫が必要になります。 90 | https://tech.retrieva.jp/entry/2019/06/04/130134#%E3%81%84%E3%81%96%E5%AE%9F%E8%B7%B5-%E3%81%A8%E3%82%8A%E3%81%82%E3%81%88%E3%81%9Aroot%E3%81%AB%E3%81%AA%E3%81%A3%E3%81%A6%E3%81%BF%E3%82%8B 91 | 92 | > 1. 
/// Parent-side placeholder: prints a marker and exits with status 0.
/// It is not called by `main` in this file.
fn parent(_: usize) !void {
    print("parent\n", .{});
    os.exit(0);
}

/// Function executed by the cloned child: replaces the process image with
/// `/bin/echo "hello world"`. The `usize` argument is the `arg` value that
/// `main` hands to `linux.clone` and is ignored.
///
/// Returns 1 only when the exec itself fails.
fn child(_: usize) callconv(.C) u8 {
    print("child\n", .{});
    const child_args = [_:null]?[*:0]const u8{ "/bin/echo", "hello world", null };
    const envp = [_:null]?[*:0]const u8{null};
    // execveZ only comes back on failure and hands back the bare error value
    // (not an error union), so there is nothing to `catch` here.
    const err = os.execveZ("/bin/echo", &child_args, &envp);
    print("execve failed: {s}\n", .{@errorName(err)});
    return 1;
}

/// Size in bytes of the stack handed to the cloned child.
const STACK_SIZE = 1024 * 1024;

/// Starts `child` in new IPC, network and user namespaces via the raw
/// `clone` syscall wrapper.
///
/// The memory layout follows std's Linux thread spawn code:
/// one guard page, then the child stack, then the TLS area.
/// reference: https://github.com/ziglang/zig/blob/1a16b7214d88261f0e38b7ca4d15bcd76caaec4c/lib/std/Thread.zig#L896
///
/// NOTE(review): this example used to be marked "NOT WORKING". The fixes below
/// address the defects visible in the code, but the program has not been
/// verified end-to-end.
pub fn main() !void {
    const page_size = std.mem.page_size;

    var guard_offset: usize = undefined;
    var stack_offset: usize = undefined;
    var tls_offset: usize = undefined;

    const map_bytes = blk: {
        // First page: guard page, stays inaccessible.
        var bytes: usize = page_size;
        guard_offset = bytes;

        // Child stack; `stack_offset` is its upper end.
        bytes += std.math.max(page_size, STACK_SIZE);
        bytes = std.mem.alignForward(bytes, page_size);
        stack_offset = bytes;

        // Thread-local storage image.
        bytes = std.mem.alignForward(bytes, linux.tls.tls_image.alloc_align);
        tls_offset = bytes;
        bytes += linux.tls.tls_image.alloc_size;

        bytes = std.mem.alignForward(bytes, page_size);
        break :blk bytes;
    };

    // map all memory needed without read/write permissions
    // to avoid committing the whole region right away
    const mapped = os.mmap(
        null,
        map_bytes,
        os.PROT.NONE,
        os.MAP.PRIVATE | os.MAP.ANONYMOUS,
        -1,
        0,
    ) catch |err| switch (err) {
        error.MemoryMappingNotSupported => unreachable,
        error.AccessDenied => unreachable,
        error.PermissionDenied => unreachable,
        else => |e| return e,
    };
    assert(mapped.len >= map_bytes);
    // The flags below do not include CLONE_VM, so the child works on its own
    // copy of this mapping and the parent can always release its copy.
    defer os.munmap(mapped);

    // Everything except the guard page must be readable and writable before
    // it is touched: prepareTLS writes into the TLS area and the child runs on
    // the stack. Leaving the region PROT_NONE makes both of them fault.
    try os.mprotect(mapped[guard_offset..], os.PROT.READ | os.PROT.WRITE);

    const tls_ptr = os.linux.tls.prepareTLS(mapped[tls_offset..]);
    // Opaque value forwarded to `child` as its single parameter; unused there.
    const arg = "";

    // pub extern fn clone(func: CloneFn, stack: usize, flags: usize, arg: usize, ptid: *i32, tls: usize, ctid: *i32)
    // ref) https://github.com/ziglang/zig/blob/1a16b7214d88261f0e38b7ca4d15bcd76caaec4c/lib/std/os/linux/x86_64.zig#L104
    // ref) https://github.com/ziglang/zig/blob/1a16b7214d88261f0e38b7ca4d15bcd76caaec4c/lib/std/Thread.zig#L995-L1013
    //
    // NOTE(review): no exit signal (e.g. SIGCHLD) is or-ed into the flags, so
    // the child presumably cannot be reaped with a plain waitpid - confirm
    // against clone(2) before adding a wait here.
    const flags = linux.CLONE.NEWIPC | linux.CLONE.NEWNET | linux.CLONE.NEWUSER;
    var parent_tid: i32 = undefined;
    var child_tid: i32 = 1;
    const rc = linux.clone(
        child,
        @ptrToInt(&mapped[stack_offset]), // top of the child stack
        flags,
        @ptrToInt(arg),
        &parent_tid,
        tls_ptr,
        &child_tid,
    );
    switch (linux.getErrno(rc)) {
        .SUCCESS => {},
        .INVAL => unreachable,
        .NOMEM => return error.SystemResources,
        // Creating namespaces can legitimately be refused (policy, nesting or
        // per-user limits), so these are reported instead of being declared
        // impossible.
        .NOSPC, .USERS => return error.SystemResources,
        .PERM => return error.PermissionDenied,
        else => |e| return os.unexpectedErrno(e),
    }

    print("parent\n", .{});
}
/// Parent branch after the fork: exits immediately with status 0.
/// The argument is the raw `fork` return value (the child's pid) and is unused.
fn parent(_: usize) !void {
    os.exit(0);
}

/// Child branch after the fork: replaces the process image with
/// `/bin/echo "hello world"` and exits with status 1 if that fails.
fn child() !void {
    const child_args = [_:null]?[*:0]const u8{ "/bin/echo", "hello world", null };
    const envp = [_:null]?[*:0]const u8{null};
    // execveZ only comes back on failure and hands back the bare error value
    // (not an error union), so `catch` cannot be applied to it.
    const err = os.execveZ("/bin/echo", &child_args, &envp);
    print("execve failed: {s}\n", .{@errorName(err)});
    os.exit(1);
}

/// Minimal fork + exec example built on the raw `linux.fork` syscall wrapper.
pub fn main() !void {
    // The raw wrapper returns a `usize`; a failure is encoded as a negated
    // errno in that value and can never compare equal to -1, so it has to be
    // decoded with getErrno.
    const rc = linux.fork();
    if (linux.getErrno(rc) != .SUCCESS) {
        print("fork failed\n", .{});
        os.exit(1);
    }

    if (rc == 0) {
        try child();
    } else {
        try parent(rc);
    }
}
/// One-byte tags exchanged over the sync socket so that parent and child can
/// coordinate the writing of the child's uid/gid maps.
const sync_t = enum(c_int) {
    /// Sent by the child once it has unshared its namespaces.
    SYNC_USERMAP_PLS = 0x40,
    /// Sent by the parent once it has written the uid/gid maps.
    SYNC_USERMAP_ACK = 0x41,
};

/// Parent side of the handshake.
///
/// Waits for the child's SYNC_USERMAP_PLS, writes `/proc/<cpid>/uid_map` and
/// `/proc/<cpid>/gid_map`, answers with SYNC_USERMAP_ACK and then waits for
/// the child to terminate.
///
/// Uses `syncpipe[0]` and closes `syncpipe[1]`. Returns `error.Unexpected`
/// when the handshake byte is missing or is not the expected tag; I/O and
/// allocation errors are propagated.
fn parent(allocator: mem.Allocator, cpid: os.pid_t, syncpipe: [2]os.fd_t) !void {
    const syncfd = syncpipe[0];
    os.close(syncpipe[1]);

    print("parent pid: {}\n", .{linux.getpid()});
    print("child pid: {}\n", .{cpid});

    var buf: [1]u8 = undefined;
    const read_len = try os.read(syncfd, &buf);
    print("read {} bytes from child\n", .{read_len});
    if (read_len != 1) {
        return error.Unexpected;
    }
    // The byte comes from another process: compare it as a plain integer
    // instead of converting it to the enum, which is only valid for known tags.
    if (buf[0] != @enumToInt(sync_t.SYNC_USERMAP_PLS)) {
        return error.Unexpected;
    }

    // https://man7.org/linux/man-pages/man7/user_namespaces.7.html#:~:text=User%20and%20group%20ID%20mappings%3A%20uid_map%20and%20gid_map
    // uid_map and gid_map are only writable from parent process.
    // Hard-coded outside IDs that get mapped to 0 inside the new namespace.
    const uid = 1000;
    const gid = 1000;

    const string_pid = try fmt.allocPrint(allocator, "{}", .{cpid});
    defer allocator.free(string_pid);
    const uid_map_path = try fs.path.join(allocator, &[_][]const u8{ "/proc", string_pid, "uid_map" });
    defer allocator.free(uid_map_path);
    const gid_map_path = try fs.path.join(allocator, &[_][]const u8{ "/proc", string_pid, "gid_map" });
    defer allocator.free(gid_map_path);

    print("uid_map_path: {s}\n", .{uid_map_path});
    print("gid_map_path: {s}\n", .{gid_map_path});

    var uid_map = try fs.openFileAbsolute(uid_map_path, .{ .read = true, .write = true });
    defer uid_map.close();
    var gid_map = try fs.openFileAbsolute(gid_map_path, .{ .read = true, .write = true });
    defer gid_map.close();

    // Map format: "<id inside ns> <id outside ns> <length>".
    const uid_map_contents = try fmt.allocPrint(allocator, "0 {} 1\n", .{uid});
    defer allocator.free(uid_map_contents);
    const gid_map_contents = try fmt.allocPrint(allocator, "0 {} 1\n", .{gid});
    defer allocator.free(gid_map_contents);

    try uid_map.writer().writeAll(uid_map_contents);
    try gid_map.writer().writeAll(gid_map_contents);

    const synctag: []const u8 = &[_]u8{@intCast(u8, @enumToInt(sync_t.SYNC_USERMAP_ACK))};
    const written = try os.write(syncfd, synctag);
    print("wrote {} bytes to child\n", .{written});
    if (written != 1) {
        return error.Unexpected;
    }

    // The exit status is currently ignored.
    // NOTE(review): std.os.linux.W appears to offer IFEXITED/EXITSTATUS
    // helpers for decoding it - confirm for the targeted Zig version.
    const result = os.waitpid(cpid, 0);
    _ = result.status;
}

/// Child side of the handshake.
///
/// Unshares the IPC, network and user namespaces, asks the parent to write
/// the id maps (SYNC_USERMAP_PLS), waits for SYNC_USERMAP_ACK, switches to
/// uid/gid 0 inside the new user namespace and finally execs `/bin/sh`.
///
/// Uses `syncpipe[1]` and closes `syncpipe[0]`. `allocator` is unused.
fn child(allocator: mem.Allocator, syncpipe: [2]os.fd_t) !void {
    _ = allocator;

    const syncfd = syncpipe[1];
    os.close(syncpipe[0]);

    // The linux.* wrappers are raw syscalls: they return a `usize` in which a
    // failure is encoded as a negated errno. Such a value never equals -1, so
    // every result is decoded with getErrno.
    const flags = linux.CLONE.NEWIPC | linux.CLONE.NEWNET | linux.CLONE.NEWUSER;
    if (linux.getErrno(linux.unshare(flags)) != .SUCCESS) {
        print("unshare failed\n", .{});
        os.exit(1);
    }

    const synctag: []const u8 = &[_]u8{@intCast(u8, @enumToInt(sync_t.SYNC_USERMAP_PLS))};
    const written = try os.write(syncfd, synctag);
    if (written != 1) {
        return error.Unexpected;
    }

    var buf: [1]u8 = undefined;
    const read_len = try os.read(syncfd, &buf);
    if (read_len != 1) {
        return error.Unexpected;
    }
    if (buf[0] != @enumToInt(sync_t.SYNC_USERMAP_ACK)) {
        return error.Unexpected;
    }

    if (linux.getErrno(linux.setresuid(0, 0, 0)) != .SUCCESS) {
        print("setresuid failed\n", .{});
        return error.Unexpected;
    }
    if (linux.getErrno(linux.setresgid(0, 0, 0)) != .SUCCESS) {
        print("setresgid failed\n", .{});
        return error.Unexpected;
    }

    const child_args = [_:null]?[*:0]const u8{ "/bin/sh", null };
    const envp = [_:null]?[*:0]const u8{null};
    // Only returns on failure; the returned value is the error itself.
    return os.execveZ("/bin/sh", &child_args, &envp);
}

// Use fork and unshare to create a new process in new IPC, network and user
// namespaces.
// youki: https://github.com/containers/youki/blob/619ae7d1eccbd82fd116465ed25ef410ace2a2a1/crates/libcontainer/src/process/container_main_process.rs#L206-L240
pub fn main() !void {
    var arena = ArenaAllocator.init(std.heap.page_allocator);
    defer arena.deinit();
    const allocator = arena.allocator();

    // Bidirectional channel used for the PLS/ACK handshake.
    // NOTE(review): the array is passed exactly as before (by value, matching
    // the std signature this file was written against) - confirm that the
    // descriptors really end up in `syncsocket` on the targeted Zig version.
    var syncsocket: [2]os.fd_t = undefined;
    const rc = linux.socketpair(linux.AF.UNIX, linux.SOCK.STREAM, 0, syncsocket);
    // `rc` is unsigned, so a `< 0` test can never fire; decode the errno.
    if (linux.getErrno(rc) != .SUCCESS) {
        print("socketpair failed\n", .{});
        os.exit(1);
    }

    const cpid = os.fork() catch {
        print("fork failed\n", .{});
        os.exit(1);
    };

    if (cpid == 0) { // child
        try child(allocator, syncsocket);
    } else { // parent
        try parent(allocator, cpid, syncsocket);
    }
}
// https://github.com/vrischmann/zig-prometheus/blob/46c6a1d32802976e84659dfedaaee70408850091/examples/basic/main.zig
/// Returns a newly allocated string of `n` characters drawn from `random`
/// out of the ASCII letters and digits.
///
/// The caller owns the returned slice and frees it with `allocator`.
/// Fails only when the allocation fails.
fn getRandomString(allocator: std.mem.Allocator, random: std.rand.Random, n: usize) ![]const u8 {
    const chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";

    const items = try allocator.alloc(u8, n);
    for (items) |*item| {
        const random_pos = random.intRangeLessThan(usize, 0, chars.len);
        item.* = chars[random_pos];
    }

    return items;
}

/// Prints three random 10-character strings, seeded from the wall clock.
pub fn main() !void {
    var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    // Releases every string allocated below in one go.
    defer arena.deinit();
    const allocator = arena.allocator();

    // The timestamp is signed; reinterpret its bits so that any value is an
    // acceptable seed.
    const now = std.time.timestamp();
    var prng = std.rand.DefaultPrng.init(@bitCast(u64, now));
    const random = prng.random();

    var i: usize = 0;
    while (i < 3) : (i += 1) {
        // Propagate an allocation failure instead of handing the error union
        // to the formatter.
        const hoge = try getRandomString(allocator, random, 10);
        std.debug.print("{s}\n", .{hoge});
    }
}
/// Parent branch: blocks until the child `cpid` terminates.
/// The exit status is currently ignored.
/// NOTE(review): std.os.linux.W appears to offer IFEXITED/EXITSTATUS helpers
/// for decoding it - confirm for the targeted Zig version.
fn parent(cpid: os.pid_t) !void {
    const result = os.waitpid(cpid, 0);
    _ = result.status;
}

/// Child branch: unshares the IPC, network and user namespaces and then
/// replaces the process image with `/bin/sh`.
///
/// Exits with status 1 when unshare fails; returns the exec error when
/// `/bin/sh` cannot be executed.
fn child() !void {
    const flags = linux.CLONE.NEWIPC | linux.CLONE.NEWNET | linux.CLONE.NEWUSER;
    // linux.unshare is the raw syscall: it returns a `usize` in which a failure
    // is encoded as a negated errno, which never equals -1.
    if (linux.getErrno(linux.unshare(flags)) != .SUCCESS) {
        print("unshare failed\n", .{});
        os.exit(1);
    }

    const child_args = [_:null]?[*:0]const u8{ "/bin/sh", null };
    const envp = [_:null]?[*:0]const u8{null};
    // execveZ only comes back on failure and hands back the bare error value,
    // so it is returned directly rather than being swallowed.
    return os.execveZ("/bin/sh", &child_args, &envp);
}

// Use fork and unshare to create a new process in new IPC, network and user
// namespaces.
// youki: https://github.com/containers/youki/blob/619ae7d1eccbd82fd116465ed25ef410ace2a2a1/crates/libcontainer/src/process/container_main_process.rs#L206-L240
pub fn main() !void {
    const pid = os.fork() catch {
        print("fork failed\n", .{});
        os.exit(1);
    };

    if (pid == 0) { // child
        try child();
    } else { // parent
        try parent(pid);
    }
}
| } 15 | } 16 | -------------------------------------------------------------------------------- /labo/basic/tmpfile.zig: -------------------------------------------------------------------------------- 1 | const std = @import("std"); 2 | const rand = std.rand; 3 | const fs = std.fs; 4 | const mem = std.mem; 5 | const Allocator = mem.Allocator; 6 | 7 | pub const RandomStringError = mem.Allocator.Error; 8 | 9 | /// Generate random string 10 | pub fn randomString(allocator: Allocator, random: rand.Random, n: usize) RandomStringError![]const u8 { 11 | const chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"; 12 | 13 | var items = try allocator.alloc(u8, n); 14 | for (items) |*item| { 15 | const random_pos = random.intRangeLessThan(usize, 0, chars.len); 16 | item.* = chars[random_pos]; 17 | } 18 | 19 | return items; 20 | } 21 | 22 | pub fn main() !void { 23 | const now = std.time.timestamp(); 24 | var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator); 25 | var allocator = arena.allocator(); 26 | 27 | const parent_path = "/tmp"; 28 | var prng = rand.DefaultPrng.init(@intCast(u64, now)); 29 | const random = prng.random(); 30 | const max_retry = 10; 31 | const length = 10; 32 | for ([_]u0{0} ** max_retry) |_| { 33 | const file_name = try randomString(allocator, random, length); 34 | std.debug.print("createTempFile: try to create '{s}'", .{file_name}); 35 | const path = try fs.path.join(allocator, &[_][]const u8{ parent_path, file_name }); 36 | const file = fs.createFileAbsolute(path, .{}) catch |err| switch (err) { 37 | error.PathAlreadyExists => continue, 38 | else => return err, 39 | }; 40 | defer file.close(); 41 | return std.debug.print("{s}\n", .{path}); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /labo/basic/userns_child_exec: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/musaprg/runzigc/6ad4fc4146750566d76b46d61324bfd44aabc98a/labo/basic/userns_child_exec -------------------------------------------------------------------------------- /labo/basic/userns_child_exec.c: -------------------------------------------------------------------------------- 1 | /* userns_child_exec.c 2 | 3 | Copyright 2013, Michael Kerrisk 4 | Licensed under GNU General Public License v2 or later 5 | 6 | Create a child process that executes a shell command in new 7 | namespace(s); allow UID and GID mappings to be specified when 8 | creating a user namespace. 9 | */ 10 | #define _GNU_SOURCE 11 | #include <sched.h> 12 | #include <unistd.h> 13 | #include <stdlib.h> 14 | #include <sys/wait.h> 15 | #include <signal.h> 16 | #include <fcntl.h> 17 | #include <stdio.h> 18 | #include <string.h> 19 | #include <limits.h> 20 | #include <errno.h> 21 | 22 | /* A simple error-handling function: print an error message based 23 | on the value in 'errno' and terminate the calling process */ 24 | 25 | #define errExit(msg) do { perror(msg); exit(EXIT_FAILURE); \ 26 | } while (0) 27 | 28 | struct child_args { 29 | char **argv; /* Command to be executed by child, with arguments */ 30 | int pipe_fd[2]; /* Pipe used to synchronize parent and child */ 31 | }; 32 | 33 | static int verbose; 34 | 35 | static void 36 | usage(char *pname) 37 | { 38 | fprintf(stderr, "Usage: %s [options] cmd [arg...]\n\n", pname); 39 | fprintf(stderr, "Create a child process that executes a shell command " 40 | "in a new user namespace,\n" 41 | "and possibly also other new namespace(s).\n\n"); 42 | fprintf(stderr, "Options can be:\n\n"); 43 | #define fpe(str) fprintf(stderr, " %s", str); 44 | fpe("-i New IPC namespace\n"); 45 | fpe("-m New mount namespace\n"); 46 | fpe("-n New network namespace\n"); 47 | fpe("-p New PID namespace\n"); 48 | fpe("-u New UTS namespace\n"); 49 | fpe("-U New user namespace\n"); 50 | fpe("-M uid_map Specify UID map for user namespace\n"); 51 | fpe("-G gid_map Specify GID map for user namespace\n"); 52 | fpe(" If -M or -G is specified, -U is 
required\n"); 53 | fpe("-v Display verbose messages\n"); 54 | fpe("\n"); 55 | fpe("Map strings for -M and -G consist of records of the form:\n"); 56 | fpe("\n"); 57 | fpe(" ID-inside-ns ID-outside-ns len\n"); 58 | fpe("\n"); 59 | fpe("A map string can contain multiple records, separated by commas;\n"); 60 | fpe("the commas are replaced by newlines before writing to map files.\n"); 61 | 62 | exit(EXIT_FAILURE); 63 | } 64 | 65 | /* Update the mapping file 'map_file', with the value provided in 66 | 'mapping', a string that defines a UID or GID mapping. A UID or 67 | GID mapping consists of one or more newline-delimited records 68 | of the form: 69 | 70 | ID_inside-ns ID-outside-ns length 71 | 72 | Requiring the user to supply a string that contains newlines is 73 | of course inconvenient for command-line use. Thus, we permit the 74 | use of commas to delimit records in this string, and replace them 75 | with newlines before writing the string to the file. */ 76 | 77 | static void 78 | update_map(char *mapping, char *map_file) 79 | { 80 | int fd, j; 81 | size_t map_len; /* Length of 'mapping' */ 82 | 83 | /* Replace commas in mapping string with newlines */ 84 | 85 | map_len = strlen(mapping); 86 | for (j = 0; j < map_len; j++) 87 | if (mapping[j] == ',') 88 | mapping[j] = '\n'; 89 | 90 | fd = open(map_file, O_RDWR); 91 | if (fd == -1) { 92 | fprintf(stderr, "open %s: %s\n", map_file, strerror(errno)); 93 | exit(EXIT_FAILURE); 94 | } 95 | 96 | if (write(fd, mapping, map_len) != map_len) { 97 | fprintf(stderr, "write %s: %s\n", map_file, strerror(errno)); 98 | exit(EXIT_FAILURE); 99 | } 100 | 101 | close(fd); 102 | } 103 | 104 | static int /* Start function for cloned child */ 105 | childFunc(void *arg) 106 | { 107 | struct child_args *args = (struct child_args *) arg; 108 | char ch; 109 | 110 | /* Wait until the parent has updated the UID and GID mappings. See 111 | the comment in main(). 
We wait for end of file on a pipe that will 112 | be closed by the parent process once it has updated the mappings. */ 113 | 114 | close(args->pipe_fd[1]); /* Close our descriptor for the write end 115 | of the pipe so that we see EOF when 116 | parent closes its descriptor */ 117 | if (read(args->pipe_fd[0], &ch, 1) != 0) { 118 | fprintf(stderr, "Failure in child: read from pipe returned != 0\n"); 119 | exit(EXIT_FAILURE); 120 | } 121 | 122 | /* Execute a shell command */ 123 | 124 | execvp(args->argv[0], args->argv); 125 | errExit("execvp"); 126 | } 127 | 128 | #define STACK_SIZE (1024 * 1024) 129 | 130 | static char child_stack[STACK_SIZE]; /* Space for child's stack */ 131 | 132 | int 133 | main(int argc, char *argv[]) 134 | { 135 | int flags, opt; 136 | pid_t child_pid; 137 | struct child_args args; 138 | char *uid_map, *gid_map; 139 | char map_path[PATH_MAX]; 140 | 141 | /* Parse command-line options. The initial '+' character in 142 | the final getopt() argument prevents GNU-style permutation 143 | of command-line options. That's useful, since sometimes 144 | the 'command' to be executed by this program itself 145 | has command-line options. We don't want getopt() to treat 146 | those as options to this program. 
*/ 147 | 148 | flags = 0; 149 | verbose = 0; 150 | gid_map = NULL; 151 | uid_map = NULL; 152 | while ((opt = getopt(argc, argv, "+imnpuUM:G:v")) != -1) { 153 | switch (opt) { 154 | case 'i': flags |= CLONE_NEWIPC; break; 155 | case 'm': flags |= CLONE_NEWNS; break; 156 | case 'n': flags |= CLONE_NEWNET; break; 157 | case 'p': flags |= CLONE_NEWPID; break; 158 | case 'u': flags |= CLONE_NEWUTS; break; 159 | case 'v': verbose = 1; break; 160 | case 'M': uid_map = optarg; break; 161 | case 'G': gid_map = optarg; break; 162 | case 'U': flags |= CLONE_NEWUSER; break; 163 | default: usage(argv[0]); 164 | } 165 | } 166 | 167 | /* -M or -G without -U is nonsensical */ 168 | 169 | if ((uid_map != NULL || gid_map != NULL) && 170 | !(flags & CLONE_NEWUSER)) 171 | usage(argv[0]); 172 | 173 | args.argv = &argv[optind]; 174 | 175 | /* We use a pipe to synchronize the parent and child, in order to 176 | ensure that the parent sets the UID and GID maps before the child 177 | calls execve(). This ensures that the child maintains its 178 | capabilities during the execve() in the common case where we 179 | want to map the child's effective user ID to 0 in the new user 180 | namespace. Without this synchronization, the child would lose 181 | its capabilities if it performed an execve() with nonzero 182 | user IDs (see the capabilities(7) man page for details of the 183 | transformation of a process's capabilities during execve()). 
*/ 184 | 185 | if (pipe(args.pipe_fd) == -1) 186 | errExit("pipe"); 187 | 188 | /* Create the child in new namespace(s) */ 189 | 190 | child_pid = clone(childFunc, child_stack + STACK_SIZE, 191 | flags | SIGCHLD, &args); 192 | if (child_pid == -1) 193 | errExit("clone"); 194 | 195 | /* Parent falls through to here */ 196 | 197 | if (verbose) 198 | printf("%s: PID of child created by clone() is %ld\n", 199 | argv[0], (long) child_pid); 200 | 201 | /* Update the UID and GID maps in the child */ 202 | 203 | if (uid_map != NULL) { 204 | snprintf(map_path, PATH_MAX, "/proc/%ld/uid_map", 205 | (long) child_pid); 206 | update_map(uid_map, map_path); 207 | } 208 | if (gid_map != NULL) { 209 | snprintf(map_path, PATH_MAX, "/proc/%ld/gid_map", 210 | (long) child_pid); 211 | update_map(gid_map, map_path); 212 | } 213 | 214 | /* Close the write end of the pipe, to signal to the child that we 215 | have updated the UID and GID maps */ 216 | 217 | close(args.pipe_fd[1]); 218 | 219 | if (waitpid(child_pid, NULL, 0) == -1) /* Wait for child */ 220 | errExit("waitpid"); 221 | 222 | if (verbose) 223 | printf("%s: terminating\n", argv[0]); 224 | 225 | exit(EXIT_SUCCESS); 226 | } 227 | 228 | -------------------------------------------------------------------------------- /labo/mount_procfs/README.md: -------------------------------------------------------------------------------- 1 | ``` 2 | $ strace -f -o ./strace.txt unshare --pid --fork --mount-proc /bin/sh 3 | # exit 4 | ``` -------------------------------------------------------------------------------- /labo/mount_procfs/strace.txt: -------------------------------------------------------------------------------- 1 | 6519 execve("/usr/bin/unshare", ["unshare", "--pid", "--fork", "--mount-proc", "/bin/sh"], 0x7fffe76cdc48 /* 16 vars */) = 0 2 | 6519 brk(NULL) = 0x564e4e4c8000 3 | 6519 arch_prctl(0x3001 /* ARCH_??? 
*/, 0x7ffd0be32ea0) = -1 EINVAL (Invalid argument) 4 | 6519 access("/etc/ld.so.preload", R_OK) = -1 ENOENT (No such file or directory) 5 | 6519 openat(AT_FDCWD, "/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3 6 | 6519 fstat(3, {st_mode=S_IFREG|0644, st_size=46365, ...}) = 0 7 | 6519 mmap(NULL, 46365, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7f20d99b0000 8 | 6519 close(3) = 0 9 | 6519 openat(AT_FDCWD, "/lib/x86_64-linux-gnu/libc.so.6", O_RDONLY|O_CLOEXEC) = 3 10 | 6519 read(3, "\177ELF\2\1\1\3\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\300A\2\0\0\0\0\0"..., 832) = 832 11 | 6519 pread64(3, "\6\0\0\0\4\0\0\0@\0\0\0\0\0\0\0@\0\0\0\0\0\0\0@\0\0\0\0\0\0\0"..., 784, 64) = 784 12 | 6519 pread64(3, "\4\0\0\0\20\0\0\0\5\0\0\0GNU\0\2\0\0\300\4\0\0\0\3\0\0\0\0\0\0\0", 32, 848) = 32 13 | 6519 pread64(3, "\4\0\0\0\24\0\0\0\3\0\0\0GNU\0\30x\346\264ur\f|Q\226\236i\253-'o"..., 68, 880) = 68 14 | 6519 fstat(3, {st_mode=S_IFREG|0755, st_size=2029592, ...}) = 0 15 | 6519 mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f20d99ae000 16 | 6519 pread64(3, "\6\0\0\0\4\0\0\0@\0\0\0\0\0\0\0@\0\0\0\0\0\0\0@\0\0\0\0\0\0\0"..., 784, 64) = 784 17 | 6519 pread64(3, "\4\0\0\0\20\0\0\0\5\0\0\0GNU\0\2\0\0\300\4\0\0\0\3\0\0\0\0\0\0\0", 32, 848) = 32 18 | 6519 pread64(3, "\4\0\0\0\24\0\0\0\3\0\0\0GNU\0\30x\346\264ur\f|Q\226\236i\253-'o"..., 68, 880) = 68 19 | 6519 mmap(NULL, 2037344, PROT_READ, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f20d97bc000 20 | 6519 mmap(0x7f20d97de000, 1540096, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x22000) = 0x7f20d97de000 21 | 6519 mmap(0x7f20d9956000, 319488, PROT_READ, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x19a000) = 0x7f20d9956000 22 | 6519 mmap(0x7f20d99a4000, 24576, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x1e7000) = 0x7f20d99a4000 23 | 6519 mmap(0x7f20d99aa000, 13920, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f20d99aa000 24 | 6519 close(3) = 0 25 | 6519 arch_prctl(ARCH_SET_FS, 
0x7f20d99af580) = 0 26 | 6519 mprotect(0x7f20d99a4000, 16384, PROT_READ) = 0 27 | 6519 mprotect(0x564e4def6000, 4096, PROT_READ) = 0 28 | 6519 mprotect(0x7f20d99e9000, 4096, PROT_READ) = 0 29 | 6519 munmap(0x7f20d99b0000, 46365) = 0 30 | 6519 geteuid() = 0 31 | 6519 getegid() = 0 32 | 6519 brk(NULL) = 0x564e4e4c8000 33 | 6519 brk(0x564e4e4e9000) = 0x564e4e4e9000 34 | 6519 openat(AT_FDCWD, "/usr/lib/locale/locale-archive", O_RDONLY|O_CLOEXEC) = 3 35 | 6519 fstat(3, {st_mode=S_IFREG|0644, st_size=3035952, ...}) = 0 36 | 6519 mmap(NULL, 3035952, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7f20d94d6000 37 | 6519 close(3) = 0 38 | 6519 openat(AT_FDCWD, "/usr/share/locale/locale.alias", O_RDONLY|O_CLOEXEC) = 3 39 | 6519 fstat(3, {st_mode=S_IFREG|0644, st_size=2996, ...}) = 0 40 | 6519 read(3, "# Locale name alias data base.\n#"..., 4096) = 2996 41 | 6519 read(3, "", 4096) = 0 42 | 6519 close(3) = 0 43 | 6519 openat(AT_FDCWD, "/usr/lib/locale/C.UTF-8/LC_IDENTIFICATION", O_RDONLY|O_CLOEXEC) = 3 44 | 6519 fstat(3, {st_mode=S_IFREG|0644, st_size=252, ...}) = 0 45 | 6519 mmap(NULL, 252, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7f20d99e8000 46 | 6519 close(3) = 0 47 | 6519 openat(AT_FDCWD, "/usr/lib/x86_64-linux-gnu/gconv/gconv-modules.cache", O_RDONLY) = 3 48 | 6519 fstat(3, {st_mode=S_IFREG|0644, st_size=27002, ...}) = 0 49 | 6519 mmap(NULL, 27002, PROT_READ, MAP_SHARED, 3, 0) = 0x7f20d99b5000 50 | 6519 close(3) = 0 51 | 6519 openat(AT_FDCWD, "/usr/lib/locale/C.UTF-8/LC_MEASUREMENT", O_RDONLY|O_CLOEXEC) = 3 52 | 6519 fstat(3, {st_mode=S_IFREG|0644, st_size=23, ...}) = 0 53 | 6519 mmap(NULL, 23, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7f20d99b4000 54 | 6519 close(3) = 0 55 | 6519 openat(AT_FDCWD, "/usr/lib/locale/C.UTF-8/LC_TELEPHONE", O_RDONLY|O_CLOEXEC) = 3 56 | 6519 fstat(3, {st_mode=S_IFREG|0644, st_size=47, ...}) = 0 57 | 6519 mmap(NULL, 47, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7f20d99b3000 58 | 6519 close(3) = 0 59 | 6519 openat(AT_FDCWD, "/usr/lib/locale/C.UTF-8/LC_ADDRESS", O_RDONLY|O_CLOEXEC) 
= 3 60 | 6519 fstat(3, {st_mode=S_IFREG|0644, st_size=131, ...}) = 0 61 | 6519 mmap(NULL, 131, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7f20d99b2000 62 | 6519 close(3) = 0 63 | 6519 openat(AT_FDCWD, "/usr/lib/locale/C.UTF-8/LC_NAME", O_RDONLY|O_CLOEXEC) = 3 64 | 6519 fstat(3, {st_mode=S_IFREG|0644, st_size=62, ...}) = 0 65 | 6519 mmap(NULL, 62, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7f20d99b1000 66 | 6519 close(3) = 0 67 | 6519 openat(AT_FDCWD, "/usr/lib/locale/C.UTF-8/LC_PAPER", O_RDONLY|O_CLOEXEC) = 3 68 | 6519 fstat(3, {st_mode=S_IFREG|0644, st_size=34, ...}) = 0 69 | 6519 mmap(NULL, 34, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7f20d99b0000 70 | 6519 close(3) = 0 71 | 6519 openat(AT_FDCWD, "/usr/lib/locale/C.UTF-8/LC_MESSAGES", O_RDONLY|O_CLOEXEC) = 3 72 | 6519 fstat(3, {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0 73 | 6519 close(3) = 0 74 | 6519 openat(AT_FDCWD, "/usr/lib/locale/C.UTF-8/LC_MESSAGES/SYS_LC_MESSAGES", O_RDONLY|O_CLOEXEC) = 3 75 | 6519 fstat(3, {st_mode=S_IFREG|0644, st_size=48, ...}) = 0 76 | 6519 mmap(NULL, 48, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7f20d94d5000 77 | 6519 close(3) = 0 78 | 6519 openat(AT_FDCWD, "/usr/lib/locale/C.UTF-8/LC_MONETARY", O_RDONLY|O_CLOEXEC) = 3 79 | 6519 fstat(3, {st_mode=S_IFREG|0644, st_size=270, ...}) = 0 80 | 6519 mmap(NULL, 270, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7f20d94d4000 81 | 6519 close(3) = 0 82 | 6519 openat(AT_FDCWD, "/usr/lib/locale/C.UTF-8/LC_COLLATE", O_RDONLY|O_CLOEXEC) = 3 83 | 6519 fstat(3, {st_mode=S_IFREG|0644, st_size=1518110, ...}) = 0 84 | 6519 mmap(NULL, 1518110, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7f20d9361000 85 | 6519 close(3) = 0 86 | 6519 openat(AT_FDCWD, "/usr/lib/locale/C.UTF-8/LC_TIME", O_RDONLY|O_CLOEXEC) = 3 87 | 6519 fstat(3, {st_mode=S_IFREG|0644, st_size=3360, ...}) = 0 88 | 6519 mmap(NULL, 3360, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7f20d9360000 89 | 6519 close(3) = 0 90 | 6519 openat(AT_FDCWD, "/usr/lib/locale/C.UTF-8/LC_NUMERIC", O_RDONLY|O_CLOEXEC) = 3 91 | 6519 fstat(3, {st_mode=S_IFREG|0644, 
st_size=50, ...}) = 0 92 | 6519 mmap(NULL, 50, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7f20d935f000 93 | 6519 close(3) = 0 94 | 6519 openat(AT_FDCWD, "/usr/lib/locale/C.UTF-8/LC_CTYPE", O_RDONLY|O_CLOEXEC) = 3 95 | 6519 fstat(3, {st_mode=S_IFREG|0644, st_size=201272, ...}) = 0 96 | 6519 mmap(NULL, 201272, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7f20d932d000 97 | 6519 close(3) = 0 98 | 6519 unshare(CLONE_NEWNS|CLONE_NEWPID) = 0 99 | 6519 clone(child_stack=NULL, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x7f20d99af850) = 6520 100 | 6520 mount("none", "/", NULL, MS_REC|MS_PRIVATE, NULL 101 | 6519 wait4(6520, 102 | 6520 <... mount resumed>) = 0 103 | 6520 mount("none", "/proc", NULL, MS_REC|MS_PRIVATE, NULL) = 0 104 | 6520 mount("proc", "/proc", "proc", MS_NOSUID|MS_NODEV|MS_NOEXEC, NULL) = 0 105 | 6520 execve("/bin/sh", ["/bin/sh"], 0x7ffd0be32fa8 /* 16 vars */) = 0 106 | 6520 brk(NULL) = 0x558387f0c000 107 | 6520 arch_prctl(0x3001 /* ARCH_??? */, 0x7ffecbf4c8d0) = -1 EINVAL (Invalid argument) 108 | 6520 access("/etc/ld.so.preload", R_OK) = -1 ENOENT (No such file or directory) 109 | 6520 openat(AT_FDCWD, "/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3 110 | 6520 fstat(3, {st_mode=S_IFREG|0644, st_size=46365, ...}) = 0 111 | 6520 mmap(NULL, 46365, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7fe8cab69000 112 | 6520 close(3) = 0 113 | 6520 openat(AT_FDCWD, "/lib/x86_64-linux-gnu/libc.so.6", O_RDONLY|O_CLOEXEC) = 3 114 | 6520 read(3, "\177ELF\2\1\1\3\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\300A\2\0\0\0\0\0"..., 832) = 832 115 | 6520 pread64(3, "\6\0\0\0\4\0\0\0@\0\0\0\0\0\0\0@\0\0\0\0\0\0\0@\0\0\0\0\0\0\0"..., 784, 64) = 784 116 | 6520 pread64(3, "\4\0\0\0\20\0\0\0\5\0\0\0GNU\0\2\0\0\300\4\0\0\0\3\0\0\0\0\0\0\0", 32, 848) = 32 117 | 6520 pread64(3, "\4\0\0\0\24\0\0\0\3\0\0\0GNU\0\30x\346\264ur\f|Q\226\236i\253-'o"..., 68, 880) = 68 118 | 6520 fstat(3, {st_mode=S_IFREG|0755, st_size=2029592, ...}) = 0 119 | 6520 mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 
0) = 0x7fe8cab67000 120 | 6520 pread64(3, "\6\0\0\0\4\0\0\0@\0\0\0\0\0\0\0@\0\0\0\0\0\0\0@\0\0\0\0\0\0\0"..., 784, 64) = 784 121 | 6520 pread64(3, "\4\0\0\0\20\0\0\0\5\0\0\0GNU\0\2\0\0\300\4\0\0\0\3\0\0\0\0\0\0\0", 32, 848) = 32 122 | 6520 pread64(3, "\4\0\0\0\24\0\0\0\3\0\0\0GNU\0\30x\346\264ur\f|Q\226\236i\253-'o"..., 68, 880) = 68 123 | 6520 mmap(NULL, 2037344, PROT_READ, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7fe8ca975000 124 | 6520 mmap(0x7fe8ca997000, 1540096, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x22000) = 0x7fe8ca997000 125 | 6520 mmap(0x7fe8cab0f000, 319488, PROT_READ, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x19a000) = 0x7fe8cab0f000 126 | 6520 mmap(0x7fe8cab5d000, 24576, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x1e7000) = 0x7fe8cab5d000 127 | 6520 mmap(0x7fe8cab63000, 13920, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7fe8cab63000 128 | 6520 close(3) = 0 129 | 6520 arch_prctl(ARCH_SET_FS, 0x7fe8cab68580) = 0 130 | 6520 mprotect(0x7fe8cab5d000, 16384, PROT_READ) = 0 131 | 6520 mprotect(0x55838686d000, 8192, PROT_READ) = 0 132 | 6520 mprotect(0x7fe8caba2000, 4096, PROT_READ) = 0 133 | 6520 munmap(0x7fe8cab69000, 46365) = 0 134 | 6520 getuid() = 0 135 | 6520 getgid() = 0 136 | 6520 getpid() = 1 137 | 6520 rt_sigaction(SIGCHLD, {sa_handler=0x558386862c30, sa_mask=~[RTMIN RT_1], sa_flags=SA_RESTORER, sa_restorer=0x7fe8ca9b8090}, NULL, 8) = 0 138 | 6520 geteuid() = 0 139 | 6520 brk(NULL) = 0x558387f0c000 140 | 6520 brk(0x558387f2d000) = 0x558387f2d000 141 | 6520 getppid() = 0 142 | 6520 stat("/home/mssn/workspace/personal/runzigc", {st_mode=S_IFDIR|0775, st_size=4096, ...}) = 0 143 | 6520 stat(".", {st_mode=S_IFDIR|0775, st_size=4096, ...}) = 0 144 | 6520 ioctl(0, TCGETS, {B38400 opost isig icanon echo ...}) = 0 145 | 6520 ioctl(1, TCGETS, {B38400 opost isig icanon echo ...}) = 0 146 | 6520 geteuid() = 0 147 | 6520 getegid() = 0 148 | 6520 rt_sigaction(SIGINT, NULL, {sa_handler=SIG_DFL, 
sa_mask=[], sa_flags=0}, 8) = 0 149 | 6520 rt_sigaction(SIGINT, {sa_handler=0x558386862c30, sa_mask=~[RTMIN RT_1], sa_flags=SA_RESTORER, sa_restorer=0x7fe8ca9b8090}, NULL, 8) = 0 150 | 6520 rt_sigaction(SIGQUIT, NULL, {sa_handler=SIG_DFL, sa_mask=[], sa_flags=0}, 8) = 0 151 | 6520 rt_sigaction(SIGQUIT, {sa_handler=SIG_IGN, sa_mask=~[RTMIN RT_1], sa_flags=SA_RESTORER, sa_restorer=0x7fe8ca9b8090}, NULL, 8) = 0 152 | 6520 rt_sigaction(SIGTERM, NULL, {sa_handler=SIG_DFL, sa_mask=[], sa_flags=0}, 8) = 0 153 | 6520 rt_sigaction(SIGTERM, {sa_handler=SIG_IGN, sa_mask=~[RTMIN RT_1], sa_flags=SA_RESTORER, sa_restorer=0x7fe8ca9b8090}, NULL, 8) = 0 154 | 6520 openat(AT_FDCWD, "/dev/tty", O_RDWR) = 3 155 | 6520 fcntl(3, F_DUPFD, 10) = 10 156 | 6520 close(3) = 0 157 | 6520 fcntl(10, F_SETFD, FD_CLOEXEC) = 0 158 | 6520 ioctl(10, TIOCGPGRP, [0]) = 0 159 | 6520 getpgrp() = 0 160 | 6520 rt_sigaction(SIGTSTP, NULL, {sa_handler=SIG_DFL, sa_mask=[], sa_flags=0}, 8) = 0 161 | 6520 rt_sigaction(SIGTSTP, {sa_handler=SIG_IGN, sa_mask=~[RTMIN RT_1], sa_flags=SA_RESTORER, sa_restorer=0x7fe8ca9b8090}, NULL, 8) = 0 162 | 6520 rt_sigaction(SIGTTOU, NULL, {sa_handler=SIG_DFL, sa_mask=[], sa_flags=0}, 8) = 0 163 | 6520 rt_sigaction(SIGTTOU, {sa_handler=SIG_IGN, sa_mask=~[RTMIN RT_1], sa_flags=SA_RESTORER, sa_restorer=0x7fe8ca9b8090}, NULL, 8) = 0 164 | 6520 rt_sigaction(SIGTTIN, NULL, {sa_handler=SIG_DFL, sa_mask=[], sa_flags=0}, 8) = 0 165 | 6520 rt_sigaction(SIGTTIN, {sa_handler=SIG_DFL, sa_mask=~[RTMIN RT_1], sa_flags=SA_RESTORER, sa_restorer=0x7fe8ca9b8090}, NULL, 8) = 0 166 | 6520 setpgid(0, 1) = 0 167 | 6520 ioctl(10, TIOCSPGRP, [1]) = 0 168 | 6520 wait4(-1, 0x7ffecbf4c57c, WNOHANG|WSTOPPED, NULL) = -1 ECHILD (No child processes) 169 | 6520 stat("/var/mail/root", 0x7ffecbf4c650) = -1 ENOENT (No such file or directory) 170 | 6520 write(2, "# ", 2) = 2 171 | 6520 read(0, "exit\n", 8192) = 5 172 | 6520 ioctl(10, TIOCSPGRP, [0]) = -1 ESRCH (No such process) 173 | 6520 write(2, "/bin/sh: 1: ", 
12) = 12 174 | 6520 write(2, "Cannot set tty process group (No"..., 46) = 46 175 | 6520 write(2, "\n", 1) = 1 176 | 6520 exit_group(0) = ? 177 | 6520 +++ exited with 0 +++ 178 | 6519 <... wait4 resumed>[{WIFEXITED(s) && WEXITSTATUS(s) == 0}], 0, NULL) = 6520 179 | 6519 --- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=6520, si_uid=0, si_status=0, si_utime=0, si_stime=1} --- 180 | 6519 close(1) = 0 181 | 6519 close(2) = 0 182 | 6519 exit_group(0) = ? 183 | 6519 +++ exited with 0 +++ 184 | -------------------------------------------------------------------------------- /labo/multipass/README.md: -------------------------------------------------------------------------------- 1 | # multipass 2 | 3 | development environment built with multipass -------------------------------------------------------------------------------- /labo/multipass/launch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -eu 4 | 5 | GITROOT_DIR=$(git rev-parse --show-toplevel) 6 | SCRIPT_DIR=$(cd $(dirname $0); pwd) 7 | CLOUDCONF_PATH=${SCRIPT_DIR}/cloud-config.yaml 8 | ZLS_VERSION=0.9.0 9 | VM_NAME="multipass" 10 | 11 | if [ ! -e ${CLOUDCONF_PATH} ]; then 12 | if [ ! 
-e ~/.ssh/multipass ]; then 13 | ssh-keygen -t rsa -b 4096 -C "$(uuidgen)" -f ~/.ssh/multipass 14 | cat >> ~/.ssh/config << _EOF_ 15 | Host ${VM_NAME}.local 16 | HostName ${VM_NAME}.local 17 | IdentityFile ~/.ssh/multipass 18 | User ubuntu 19 | Port 22 20 | _EOF_ 21 | fi 22 | 23 | AUTHORIZED_KEYS=$(cat ~/.ssh/multipass.pub) 24 | cat > ${CLOUDCONF_PATH} << _EOF_ 25 | #cloud-config 26 | 27 | locale: en_US.UTF8 28 | timezone: Asia/Tokyo 29 | 30 | users: 31 | - name: ubuntu 32 | sudo: ALL=(ALL) NOPASSWD:ALL 33 | ssh-authorized-keys: 34 | - ${AUTHORIZED_KEYS} 35 | 36 | package_upgrade: true 37 | 38 | packages: 39 | - avahi-daemon 40 | - apt-transport-https 41 | - ca-certificates 42 | - curl 43 | - gnupg 44 | - lsb-release 45 | - zip 46 | - jq 47 | - python3-pip 48 | - xz-utils 49 | 50 | runcmd: 51 | - sudo snap install --classic --beta zig 52 | - |- 53 | git clone https://github.com/zigtools/zls.git && \ 54 | cd zls && git checkout refs/tags/${ZLS_VERSION} && \ 55 | git submodule update --init --recursive && \ 56 | zig build -Drelease-safe 57 | _EOF_ 58 | fi 59 | 60 | multipass delete ${VM_NAME} 2>/dev/null || : 61 | multipass purge 62 | multipass launch \ 63 | --name ${VM_NAME} \ 64 | --cpus 2 \ 65 | --mem 4G \ 66 | --disk 20G \ 67 | --cloud-init ${CLOUDCONF_PATH} \ 68 | --mount ${GITROOT_DIR}:/src \ 69 | 20.04 -------------------------------------------------------------------------------- /prepare.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -u 4 | 5 | arch=$(uname -m) 6 | 7 | mkdir -m 755 -p /root/rootfs/proc 8 | mkdir -m 755 -p /root/rootfs/bin 9 | mkdir -m 755 -p /root/rootfs/lib 10 | 11 | cp -Lr /bin/* /root/rootfs/bin 12 | cp -Lr /usr/bin/* /root/rootfs/bin 13 | 14 | cp -Lr /lib/${arch}-linux-gnu /root/rootfs/lib 15 | cp -Lr /lib/ld-linux-${arch}.so* /root/rootfs/lib 16 | cp -Lr /lib64/ld-linux-${arch}.so* /root/rootfs/lib 17 | 18 | cd /root/rootfs/ 19 | ln -s lib lib64 20 | 
const CgroupsManager = @This();

const std = @import("std");
const testing = std.testing;
const mem = std.mem;
const meta = std.meta;
const fmt = std.fmt;
const fs = std.fs;
const os = std.os;
const Allocator = mem.Allocator;

const util = @import("util.zig");

// Name of the per-cgroup file that accepts PIDs (see cgroups(7)).
const cgroup_procs_file_name = "cgroup.procs";

// Currently, runzigc only supports cgroup v1.

// TODO(musaprg): refactor this file more generic, avoid dirty implementation

/// One cgroup directory path per enabled v1 subsystem.
///
/// Each field name doubles as the subsystem directory name under
/// `/sys/fs/cgroup`; `new`, `join` and `deinit` iterate over the fields, so
/// enabling a subsystem only requires uncommenting its field here.
const CgroupPaths = struct {
    // TODO(musaprg): enable all subsystem
    // blkio: []const u8,
    cpu: []const u8,
    cpuacct: []const u8,
    // cpuset: []const u8,
    devices: []const u8,
    freezer: []const u8,
    // hugetlb: []const u8,
    memory: []const u8,
    // net_cls: []const u8,
    // net_prio: []const u8,
    // perf_event: []const u8,
    // pids: []const u8,
    // rdma: []const u8,
};

allocator: Allocator,
cgroup_paths: CgroupPaths,

/// Build the per-subsystem cgroup paths for `container_id`.
///
/// Nothing is created on disk here. The returned manager owns the path
/// strings; release them with `deinit`. If an allocation fails part-way,
/// the paths already built are freed before the error is returned.
pub fn new(allocator: Allocator, container_id: []const u8) !CgroupsManager {
    // TODO(musaprg): Cgroup v2
    var paths: CgroupPaths = undefined;
    var initialized: usize = 0;
    errdefer {
        inline for (meta.fields(CgroupPaths)) |f, i| {
            if (i < initialized) allocator.free(@field(paths, f.name));
        }
    }
    inline for (meta.fields(CgroupPaths)) |f| {
        @field(paths, f.name) = try generate_cgroups_path(allocator, f.name, container_id);
        initialized += 1;
    }

    return CgroupsManager{
        .allocator = allocator,
        .cgroup_paths = paths,
    };
}

/// Add `pid` to the container's cgroup in every enabled subsystem.
///
/// For each subsystem the cgroup directory is created through
/// `util.mkdirAll`, then `pid` is written to its `cgroup.procs` file.
/// The first failure aborts the loop and is returned.
pub fn join(self: *CgroupsManager, pid: os.pid_t) !void {
    inline for (meta.fields(CgroupPaths)) |f| {
        const subsystem_path = @field(self.cgroup_paths, f.name);
        try util.mkdirAll(subsystem_path, 0o755);

        const cgroup_procs_path = try fs.path.join(self.allocator, &[_][]const u8{ subsystem_path, cgroup_procs_file_name });
        defer self.allocator.free(cgroup_procs_path);

        const cgroup_procs = try fs.cwd().openFile(cgroup_procs_path, .{ .write = true });
        defer cgroup_procs.close();

        // Format into a stack buffer and hand the whole line to a single
        // `writeAll`, rather than streaming the number and newline separately.
        var buf: [32]u8 = undefined;
        const cgroup_procs_content = try fmt.bufPrint(&buf, "{}\n", .{pid});
        try cgroup_procs.writer().writeAll(cgroup_procs_content);
    }
}

/// Do freeze with freezer subsystem. Not implemented yet; currently a no-op.
pub fn freeze(self: *CgroupsManager) !void {
    // FIXME(musaprg): Implement me
    _ = self;
}

/// Remove cgroup subsystem resource. Not implemented yet; currently a no-op.
pub fn destroy(self: *CgroupsManager) !void {
    // TODO(musaprg): backoff retry
    // FIXME(musaprg): Implement me
    _ = self;
}

/// Free every path string allocated by `new`.
pub fn deinit(self: *CgroupsManager) void {
    inline for (meta.fields(CgroupPaths)) |f| {
        self.allocator.free(@field(self.cgroup_paths, f.name));
    }
}

/// Return `/sys/fs/cgroup/<name>/runzigc/<container_id>`.
/// Caller owns the returned slice.
fn generate_cgroups_path(allocator: Allocator, name: []const u8, container_id: []const u8) ![]const u8 {
    return fs.path.join(allocator, &[_][]const u8{ "/sys/fs/cgroup", name, "runzigc", container_id });
}

test "new" {
    // testing.allocator reports a leak if deinit misses any path.
    var cgroups_manager = try CgroupsManager.new(testing.allocator, "hoge");
    defer cgroups_manager.deinit();

    try testing.expectEqualStrings("/sys/fs/cgroup/cpu/runzigc/hoge", cgroups_manager.cgroup_paths.cpu);
    try testing.expectEqualStrings("/sys/fs/cgroup/cpuacct/runzigc/hoge", cgroups_manager.cgroup_paths.cpuacct);
    try testing.expectEqualStrings("/sys/fs/cgroup/devices/runzigc/hoge", cgroups_manager.cgroup_paths.devices);
    try testing.expectEqualStrings("/sys/fs/cgroup/freezer/runzigc/hoge", cgroups_manager.cgroup_paths.freezer);
    try testing.expectEqualStrings("/sys/fs/cgroup/memory/runzigc/hoge", cgroups_manager.cgroup_paths.memory);
}
u8 = &[_][]const u8{}, 27 | type: []const u8, 28 | }; 29 | 30 | const Linux = struct { 31 | namespaces: []Namespace, 32 | uidMappings: []UidMapping, 33 | devices: []Device, 34 | cgroupsPath: []const u8, 35 | }; 36 | 37 | const Namespace = struct { 38 | type: []const u8, 39 | path: ?[]const u8 = null, 40 | }; 41 | 42 | const UidMapping = struct { 43 | containerID: u32, 44 | hostID: u32, 45 | size: u32, 46 | }; 47 | 48 | const Device = struct { 49 | type: []const u8, 50 | path: []const u8, 51 | major: i64, 52 | minor: i64, 53 | fileMode: u32, 54 | uid: u32, 55 | gid: u32, 56 | }; 57 | 58 | const CgroupResource = struct { 59 | cpu: Cpu, 60 | memory: Memory, 61 | devices: []AllowedDevice, 62 | }; 63 | 64 | const Cpu = struct { 65 | shares: u64, 66 | quota: i64, 67 | period: i64, 68 | realtimeRuntime: i64, 69 | realtimePeriod: i64, 70 | cpus: []const u8, 71 | mems: []const u8, 72 | idle: i64, 73 | }; 74 | 75 | const Memory = struct { 76 | limit: i64, 77 | reservation: i64, 78 | swap: i64, 79 | kernel: i64, 80 | kernelTCP: i64, 81 | }; 82 | 83 | const AllowedDevice = struct { 84 | allow: bool, 85 | type: []const u8, 86 | major: i64, 87 | minor: i64, 88 | access: []const u8, 89 | }; 90 | 91 | const BlockIo = struct { 92 | weight: u16, 93 | leafWeight: u16, 94 | weightDevice: []WeightDevice, 95 | throttleReadBpsDevice: []DeviceRateLimit, 96 | throttleWriteBpsDevice: []DeviceRateLimit, 97 | throttleReadIOPSDevice: []DeviceRateLimit, 98 | throttleWriteIOPSDevice: []DeviceRateLimit, 99 | }; 100 | 101 | const WeightDevice = struct { 102 | major: i64, 103 | minor: i64, 104 | weight: u16, 105 | leafWeight: u16, 106 | }; 107 | 108 | const DeviceRateLimit = struct { 109 | major: i64, 110 | minor: i64, 111 | rate: u64, 112 | }; 113 | 114 | const HugepageLimit = struct { 115 | pageSize: []const u8, 116 | limit: u64, 117 | }; 118 | 119 | const Network = struct { 120 | classID: u32, 121 | priorities: []NetworkPriority, 122 | }; 123 | 124 | const NetworkPriority = struct { 125 | name: 
allocator: Allocator,
parse_options: std.json.ParseOptions,
spec: RuntimeSpec,

/// Load and parse the OCI runtime spec stored at `path`.
///
/// `path` is opened relative to the current working directory. Unknown JSON
/// fields and trailing data are tolerated. The parsed spec is allocated with
/// `allocator`; release it with `deinit`.
pub fn new(allocator: Allocator, path: []const u8) !OciSpec {
    const file = try std.fs.cwd().openFile(path, .{});
    defer file.close();
    const file_size = try file.getEndPos();
    var reader = std.io.bufferedReader(file.reader());
    var istream = reader.reader();
    const contents = try istream.readAllAlloc(allocator, file_size);
    // The raw text is only needed while parsing.
    defer allocator.free(contents);

    const options = std.json.ParseOptions{
        .allocator = allocator,
        // TODO(musaprg): change this to false finally to validate schema
        .ignore_unknown_fields = true,
        .allow_trailing_data = true,
    };

    // TODO(musaprg): separate RuntimeSpec definition from this file.
    const spec = try std.json.parse(RuntimeSpec, &std.json.TokenStream.init(contents), options);
    return OciSpec{
        .allocator = allocator,
        .spec = spec,
        .parse_options = options,
    };
}

/// Free everything `new` allocated for the parsed spec.
pub fn deinit(self: *OciSpec) void {
    std.json.parseFree(RuntimeSpec, self.spec, self.parse_options);
}

test "sample spec for linux" {
    const allocator = testing.allocator;
    var config = try OciSpec.new(allocator, "./testdata/sample_spec_linux.json");
    defer config.deinit();
    const spec = config.spec;
    // TODO(musaprg): implement direct struct comparison
    // Expected value goes first so failure output labels both sides correctly.
    try testing.expectEqualStrings("1.0.1", spec.ociVersion);
    try testing.expectEqualStrings("rootfs", spec.root.path);
    try testing.expectEqual(true, spec.root.readonly);
    // TODO(musaprg): check mounts' contents
    try testing.expectEqual(@as(usize, 7), spec.mounts.len);
}
StateManager = @This(); 2 | 3 | // subset of libcontainer's state 4 | // ref(BaseState): https://github.com/opencontainers/runc/blob/v1.1.4/libcontainer/container.go 5 | // ref(linux-specific State): https://github.com/opencontainers/runc/blob/5fd4c4d144137e991c4acebb2146ab1483a97925/libcontainer/container_linux.go#L58-L85 6 | 7 | const std = @import("std"); 8 | const os = std.os; 9 | const testing = std.testing; 10 | const mem = std.mem; 11 | const meta = std.meta; 12 | const fs = std.fs; 13 | const Allocator = mem.Allocator; 14 | 15 | const util = @import("util.zig"); 16 | const OciSpec = @import("OciSpec.zig"); 17 | 18 | const state_file_name = "state.json"; 19 | 20 | pub const State = struct { 21 | /// Container ID 22 | id: []const u8, 23 | /// PID of init process 24 | init_process_pid: os.pid_t, 25 | init_process_start: u64, 26 | /// Created timestamp in ISO6801 27 | // TODO(musaprg): fix to marshal/unmarshal to zig's time object 28 | created: []const u8, 29 | // TODO(musaprg): add cgroup_paths 30 | // TODO(musaprg): add namespace_paths 31 | }; 32 | 33 | const CgroupPaths = struct { 34 | blkio: []const u8, 35 | cpu: []const u8, 36 | cpuacct: []const u8, 37 | cpuset: []const u8, 38 | devices: []const u8, 39 | freezer: []const u8, 40 | hugetlb: []const u8, 41 | memory: []const u8, 42 | net_cls: []const u8, 43 | net_prio: []const u8, 44 | perf_event: []const u8, 45 | pids: []const u8, 46 | rdma: []const u8, 47 | }; 48 | 49 | const NamespacePaths = struct { 50 | NEWCGROUP: []const u8, 51 | NEWIPC: []const u8, 52 | NEWNET: []const u8, 53 | NEWNS: []const u8, 54 | NEWUSER: []const u8, 55 | NEWUTS: []const u8, 56 | }; 57 | 58 | allocator: Allocator, 59 | parse_options: std.json.ParseOptions, 60 | path: []const u8, 61 | 62 | pub fn new(allocator: Allocator, root_path: []const u8) !StateManager { 63 | const options = std.json.ParseOptions{ 64 | .allocator = allocator, 65 | // TODO(musaprg): change this to false finally to validate schema 66 | .ignore_unknown_fields = 
/// Serialize `state` as JSON and store it at `self.path`.
///
/// The JSON is written to a temporary file first and that file is then
/// renamed over `self.path`, so a reader never sees a half-written state
/// file. The temporary file is created next to `self.path`: a rename only
/// works within a single mount point, and the state directory is usually not
/// on the same filesystem as the default temporary directory.
///
/// `self.path` has to be absolute, because the temporary file is created
/// with `fs.createFileAbsolute` and moved with `fs.renameAbsolute`.
pub fn write(
    self: *const StateManager,
    state: State,
) !void {
    // An empty directory makes createTempFile fall back to its default.
    const state_dir = fs.path.dirname(self.path) orelse "";
    const tmp_path = try util.createTempFile(self.allocator, state_dir);
    defer self.allocator.free(tmp_path);
    // Best effort: do not leave the temporary file behind when writing or
    // renaming fails. (Runs before the `defer` above, so `tmp_path` is valid.)
    errdefer fs.deleteFileAbsolute(tmp_path) catch |err| {
        std.log.warn("failed to remove temporary state file '{s}': {}", .{ tmp_path, err });
    };

    {
        const tmp_file = try fs.openFileAbsolute(tmp_path, .{ .write = true });
        defer tmp_file.close();
        try std.json.stringify(state, .{}, tmp_file.writer());
    }

    try fs.renameAbsolute(tmp_path, self.path);
}
/// Change the working directory of the calling process to `path`,
/// translating the errno values this file cares about into Zig errors.
fn changeDir(path: [*:0]const u8) !void {
    switch (os.errno(linux.chdir(path))) {
        .SUCCESS => {},
        .ACCES => return error.AccessDenied,
        .PERM => return error.OperationNotPermitted,
        .BUSY => return error.Busy,
        .NOTDIR => return error.NotDir,
        .INVAL => return error.Invalid,
        else => |err| {
            // TODO(musaprg): define error type
            return os.unexpectedErrno(err);
        },
    }
}

/// Write `value` followed by a newline to the file `name` inside `cgroup_dir`.
fn writeCgroupValue(allocator: mem.Allocator, cgroup_dir: []const u8, name: []const u8, value: anytype) !void {
    const path = fs.path.join(allocator, &[_][]const u8{ cgroup_dir, name }) catch |err| {
        log.debug("failed to join cgroup path: {}\n", .{err});
        return err;
    };
    defer allocator.free(path);

    const file = try fs.openFileAbsolute(path, .{ .write = true });
    defer file.close();

    // Format into a buffer first so the whole value reaches the file through
    // a single writeAll call instead of piecemeal formatter writes.
    const content = try fmt.allocPrint(allocator, "{}\n", .{value});
    defer allocator.free(content);
    try file.writer().writeAll(content);
}

/// Container-side setup, executed by the re-exec'ed `init` subcommand.
///
/// In order: starts a new session, switches to uid/gid 0, sets the hostname,
/// puts the calling process into the cpu cgroup
/// `/sys/fs/cgroup/cpu/runzigc/<container_id>` and writes its CFS quota,
/// mounts procfs, pivots the root filesystem to `/root/rootfs`, detaches and
/// removes the old root, unshares the cgroup namespace and finally replaces
/// the process image with `/bin/sh`.
///
/// Only returns when one of those steps fails; on success `execveZ` does
/// not return.
fn init(allocator: mem.Allocator, container_id: []const u8) !void {
    try syscall.setsid();
    try os.setuid(0);
    try os.setgid(0);

    const hostname = "test";
    syscall.sethostname(hostname) catch |err| {
        log.debug("sethostname failed\n", .{});
        return err;
    };

    log.debug("GRANDCHILD: current uid: {}\n", .{linux.getuid()});
    log.debug("GRANDCHILD: current gid: {}\n", .{linux.getgid()});

    const cgroup_cpu_path = fs.path.join(allocator, &[_][]const u8{ "/sys/fs/cgroup/cpu/runzigc", container_id }) catch |err| {
        log.debug("failed to join cgroup path: {}\n", .{err});
        return err;
    };
    defer allocator.free(cgroup_cpu_path);

    // The mode must be an octal literal: a bare `0700` is decimal 700 in Zig.
    util.mkdirAll(cgroup_cpu_path, 0o700) catch |err| {
        switch (err) {
            error.PathAlreadyExists => {},
            else => {
                log.debug("mkdir failed: {}\n", .{err});
                return err;
            },
        }
    };

    try writeCgroupValue(allocator, cgroup_cpu_path, "tasks", linux.getpid());
    try writeCgroupValue(allocator, cgroup_cpu_path, "cpu.cfs_quota_us", 1000);

    try syscall.mount("proc", "/root/rootfs/proc", "proc", @enumToInt(syscall.MountFlags.MS_NOEXEC) | @enumToInt(syscall.MountFlags.MS_NOSUID) | @enumToInt(syscall.MountFlags.MS_NODEV), @ptrToInt(""));

    try changeDir("/root");

    try syscall.mount("rootfs", "/root/rootfs", "", @enumToInt(syscall.MountFlags.MS_BIND) | @enumToInt(syscall.MountFlags.MS_REC), @ptrToInt(""));

    util.mkdirAll("/root/rootfs/oldrootfs", 0o700) catch |err| {
        log.debug("makeDirAbsolute failed\n", .{});
        return err;
    };

    // TODO(musaprg): fix here, currently we need to create /root/pivotroot/proc before executing this
    syscall.pivot_root("rootfs", "/root/rootfs/oldrootfs") catch |err| {
        log.debug("pivot_root failed\n", .{});
        return err;
    };

    try syscall.umount("/oldrootfs", @enumToInt(syscall.UmountFlags.MNT_DETACH));

    fs.deleteDirAbsolute("/oldrootfs") catch |err| {
        log.debug("deleteDirAbsolute failed\n", .{});
        return err;
    };

    try changeDir("/");

    // unshare cgroups namespace
    // The raw syscall wrapper returns an encoded errno in a usize, so the
    // result has to be decoded with os.errno; comparing it with -1 never
    // detects a failure.
    if (os.errno(linux.unshare(linux.CLONE.NEWCGROUP)) != .SUCCESS) {
        log.debug("unshare failed\n", .{});
        os.exit(1);
    }

    const child_args = [_:null]?[*:0]const u8{ "/bin/sh", null };
    const envp = [_:null]?[*:0]const u8{null};
    return os.execveZ("/bin/sh", &child_args, &envp);
}
namespace 174 | if (linux.unshare(linux.CLONE.NEWUSER) == -1) { 175 | log.debug("unshare failed\n", .{}); 176 | os.exit(1); 177 | } 178 | 179 | var synctag: []const u8 = &[_]u8{@intCast(u8, @enumToInt(sync_t.SYNC_USERMAP_PLS))}; 180 | if (os.write(syncfd, synctag)) |size| { 181 | if (size != 1) { 182 | return error.Unexpected; 183 | } 184 | } else |err| { 185 | return err; 186 | } 187 | var buf: [1]u8 = undefined; 188 | if (os.read(syncfd, &buf)) |size| { 189 | if (size != 1) { 190 | return error.Unexpected; 191 | } 192 | } else |err| { 193 | return err; 194 | } 195 | switch (@intToEnum(sync_t, @intCast(c_int, buf[0]))) { 196 | .SYNC_USERMAP_ACK => {}, 197 | else => unreachable, 198 | } 199 | 200 | // become root 201 | if (linux.setresuid(0, 0, 0) == -1) { 202 | log.debug("setresuid failed\n", .{}); 203 | return error.Unexpected; 204 | } 205 | 206 | // unshare remaining namespaces 207 | const flags = linux.CLONE.NEWIPC | linux.CLONE.NEWNET | linux.CLONE.NEWUTS | linux.CLONE.NEWPID | linux.CLONE.NEWNS; 208 | if (linux.unshare(flags) == -1) { 209 | log.debug("unshare failed\n", .{}); 210 | os.exit(1); 211 | } 212 | 213 | var gcpid = os.fork() catch { 214 | log.debug("CHILD: fork failed\n", .{}); 215 | os.exit(1); 216 | }; 217 | 218 | if (gcpid == 0) { // grandchild 219 | const child_args = [_:null]?[*:0]const u8{ "/proc/self/exe", "init", null }; 220 | const envp = [_:null]?[*:0]const u8{null}; 221 | return os.execveZ("/proc/self/exe", &child_args, &envp); 222 | } else { // child 223 | log.debug("CHILD: grandchild pid: {}\n", .{gcpid}); 224 | log.debug("CHILD: waiting for grandchild\n", .{}); 225 | var result = os.waitpid(gcpid, 0); // i'm not sure how to handle WaitPidResult.status with zig, there's no macro like WIFEXITED 226 | _ = result.status; 227 | os.exit(0); 228 | } 229 | } else { // parent 230 | var syncfd = syncsocket[1]; 231 | 232 | log.debug("PARENT: parent pid: {}\n", .{linux.getpid()}); 233 | log.debug("PARENT: child pid: {}\n", .{cpid}); 234 | 235 | 
var buf: [1]u8 = undefined; 236 | if (os.read(syncfd, &buf)) |size| { 237 | log.debug("PARENT: read {} bytes\n", .{size}); 238 | if (size != 1) { 239 | return error.Unexpected; 240 | } 241 | } else |err| { 242 | return err; 243 | } 244 | switch (@intToEnum(sync_t, @intCast(c_int, buf[0]))) { 245 | .SYNC_USERMAP_PLS => log.debug("PARENT: received SYNC_USERMAP_PLS from child\n", .{}), 246 | else => unreachable, 247 | } 248 | 249 | // https://man7.org/linux/man-pages/man7/user_namespaces.7.html#:~:text=User%20and%20group%20ID%20mappings%3A%20uid_map%20and%20gid_map 250 | // uid_map and gid_map are only writable from parent process. 251 | var uid = linux.getuid(); 252 | //const uid = 1000; 253 | var gid = linux.getgid(); 254 | //const gid = 1000; 255 | 256 | log.debug("PARENT: uid: {}, gid: {}\n", .{ uid, gid }); 257 | 258 | var string_pid = try fmt.allocPrint(allocator, "{}", .{cpid}); 259 | defer allocator.free(string_pid); 260 | var uid_map_path = try fs.path.join(allocator, &[_][]const u8{ "/proc", string_pid, "uid_map" }); 261 | defer allocator.free(uid_map_path); 262 | var gid_map_path = try fs.path.join(allocator, &[_][]const u8{ "/proc", string_pid, "gid_map" }); 263 | defer allocator.free(gid_map_path); 264 | 265 | log.debug("PARENT: uid_map_path: {s}\n", .{uid_map_path}); 266 | log.debug("PARENT: gid_map_path: {s}\n", .{gid_map_path}); 267 | 268 | var uid_map = try fs.openFileAbsolute(uid_map_path, .{ .read = true, .write = true }); 269 | defer uid_map.close(); 270 | var gid_map = try fs.openFileAbsolute(gid_map_path, .{ .read = true, .write = true }); 271 | defer gid_map.close(); 272 | 273 | var uid_map_contents = try fmt.allocPrint(allocator, "0 {} 1\n", .{uid}); 274 | defer allocator.free(uid_map_contents); 275 | var gid_map_contents = try fmt.allocPrint(allocator, "0 {} 1\n", .{gid}); 276 | defer allocator.free(gid_map_contents); 277 | 278 | try uid_map.writer().writeAll(uid_map_contents); 279 | try gid_map.writer().writeAll(gid_map_contents); 280 | 281 | 
// TODO(musaprg): error handling
/// Prepare the state directory `<root_path>/<container_id>` for a new
/// container.
///
/// `root_path` is created when missing. Returns `error.FileExists` when an
/// entry named `<root_path>/<container_id>` is already present.
/// `bundle_path` is not used yet.
fn create(allocator: Allocator, root_path: []const u8, container_id: []const u8, bundle_path: []const u8) !void {
    _ = bundle_path;

    const container_root_path = fs.path.join(allocator, &[_][]const u8{ root_path, container_id }) catch |err| {
        log.debug("failed to join container root path: {}\n", .{err});
        return err;
    };
    defer allocator.free(container_root_path);

    // The mode must be an octal literal: a bare `0700` is decimal 700 in Zig.
    util.mkdirAll(root_path, 0o700) catch |err| {
        switch (err) {
            error.PathAlreadyExists => {},
            else => {
                log.debug("mkdir failed: {}\n", .{err});
                return err;
            },
        }
    };

    // The container directory must not exist yet, so "not found" is the one
    // probe result that lets creation continue.
    if (fs.accessAbsolute(container_root_path, .{})) |_| {
        return error.FileExists;
    } else |err| switch (err) {
        error.FileNotFound => {},
        else => return err,
    }

    try util.mkdirAll(container_root_path, 0o700);

    // TODO(musaprg): create process
}
u8, signal: []const u8) !void { 344 | _ = container_id; 345 | _ = signal; 346 | _ = allocator; 347 | } 348 | 349 | // TODO(musaprg): error handling 350 | fn delete(allocator: Allocator, container_id: []const u8) !void { 351 | _ = container_id; 352 | _ = allocator; 353 | } 354 | 355 | // Use fork and unshare to create a new process with a new PID 356 | // youki: https://github.com/containers/youki/blob/619ae7d1eccbd82fd116465ed25ef410ace2a2a1/crates/libcontainer/src/process/container_main_process.rs#L206-L240 357 | pub fn main() anyerror!void { 358 | var arena = heap.ArenaAllocator.init(std.heap.page_allocator); 359 | defer arena.deinit(); 360 | const allocator = arena.allocator(); 361 | 362 | var app = Yazap.init(allocator, "runzigc", "a simple container runtime written in zig"); 363 | defer app.deinit(); 364 | 365 | var parser = app.rootCommand(); 366 | try parser.addArg(flag.boolean("version", 'v', "Print the version")); 367 | try parser.addSubcommand(app.createCommand("init", "(deprecated) initialize container")); 368 | try parser.addSubcommand(app.createCommand("run", "(deprecated) run a command inside the container")); 369 | 370 | var subcmd_state = app.createCommand("state", "display container state"); 371 | try subcmd_state.takesSingleValue("CONTAINER_ID"); 372 | // subcmd_state.argRequired(false); 373 | try parser.addSubcommand(subcmd_state); 374 | 375 | var subcmd_start = app.createCommand("start", "start a container"); 376 | try subcmd_start.takesSingleValue("CONTAINER_ID"); 377 | try parser.addSubcommand(subcmd_start); 378 | 379 | var subcmd_create = app.createCommand("create", "create a container"); 380 | try subcmd_create.takesSingleValue("CONTAINER_ID"); 381 | try subcmd_create.takesSingleValue("BUNDLE_PATH"); 382 | try parser.addSubcommand(subcmd_create); 383 | 384 | var subcmd_kill = app.createCommand("kill", "send a signal to the container's init process"); 385 | try subcmd_kill.takesSingleValue("CONTAINER_ID"); 386 | try 
subcmd_kill.takesSingleValue("SIGNAL"); 387 | try parser.addSubcommand(subcmd_kill); 388 | 389 | var subcmd_delete = app.createCommand("delete", "delete a container"); 390 | try subcmd_delete.takesSingleValue("CONTAINER_ID"); 391 | try parser.addSubcommand(subcmd_delete); 392 | 393 | var args = try app.parseProcess(); 394 | 395 | if (!(args.hasArgs())) { 396 | try app.displayHelp(); 397 | app.deinit(); 398 | os.exit(1); 399 | } 400 | 401 | // TODO(musaprg): automatic generation 402 | const version = "v0.0.0"; 403 | if (args.isPresent("version")) { 404 | debug.print("runzigc version {s}\n", .{version}); 405 | return; 406 | } 407 | 408 | // Deprecated commands 409 | if (args.isPresent("init")) { 410 | log.debug("init\n", .{}); 411 | return try init(allocator, "hogecontainer"); 412 | } else if (args.isPresent("run")) { 413 | log.debug("run\n", .{}); 414 | return try run(allocator); 415 | } 416 | 417 | if (args.subcommandContext("state")) |sub_args| { 418 | log.debug("state\n", .{}); 419 | 420 | if (sub_args.isPresent("CONTAINER_ID")) { 421 | const container_id = sub_args.valueOf("CONTAINER_ID").?; 422 | 423 | log.debug("CONTAINER_ID={s}", .{container_id}); 424 | return; 425 | } 426 | 427 | try app.displaySubcommandHelp(); 428 | os.exit(1); 429 | } 430 | 431 | if (args.subcommandContext("start")) |sub_args| { 432 | log.debug("start\n", .{}); 433 | 434 | if (sub_args.isPresent("CONTAINER_ID")) { 435 | const container_id = sub_args.valueOf("CONTAINER_ID").?; 436 | 437 | log.debug("CONTAINER_ID={s}", .{container_id}); 438 | return; 439 | } 440 | try app.displaySubcommandHelp(); 441 | os.exit(1); 442 | } 443 | 444 | if (args.subcommandContext("create")) |sub_args| { 445 | log.debug("create\n", .{}); 446 | 447 | if (sub_args.isPresent("CONTAINER_ID") and sub_args.isPresent("BUNDLE_PATH")) { 448 | const container_id = sub_args.valueOf("CONTAINER_ID").?; 449 | const bundle_path = sub_args.valueOf("BUNDLE_PATH").?; 450 | 451 | log.debug("CONTAINER_ID={s}, BUNDLE_PATH={s}", .{ 
/// Zig errors for the errno values that `valOrErr` translates by name.
pub const LinuxKernelErrorBase = error{
    OperationNotPermitted,
    NoSuchFileOrDirectory,
    NoSuchProcess,
    InterruptedSystemCall,
    IOError,
    NoSuchDeviceOrAddress,
    ArgumentListTooLong,
    ExecFormatError,
    BadFileNumber,
    NoChildProcesses,
    TryAgain,
    OutOfMemory,
    PermissionDenied,
    BadAddress,
    BlockDeviceRequired,
    DeviceOrResourceBusy,
    FileExists,
    CrossDeviceLink,
    NoSuchDevice,
    NotADirectory,
    IsADirectory,
    InvalidArgument,
    FileTableOverflow,
    TooManyOpenFiles,
    NotATypewriter,
    TextFileBusy,
    FileTooLarge,
    NoSpaceLeftOnDevice,
    IllegalSeek,
    ReadOnlyFileSystem,
    TooManyLinks,
    BrokenPipe,
    MathArgumentOutOfDomainOfFunc,
    MathResultNotRepresentable,
};
pub const LinuxKernelError = LinuxKernelErrorBase || os.UnexpectedError;

/// Turn the raw return value of a Linux syscall into a Zig result.
///
/// `errno` is the value returned by the syscall wrapper. When it decodes to
/// success, `val` is returned unchanged; otherwise the matching
/// `LinuxKernelError` is returned. Codes without a named error go through
/// `os.unexpectedErrno`.
pub fn valOrErr(val: anytype, errno: usize) LinuxKernelError!@TypeOf(val) {
    return switch (os.errno(errno)) {
        .SUCCESS => val,
        .PERM => error.OperationNotPermitted,
        .NOENT => error.NoSuchFileOrDirectory,
        .SRCH => error.NoSuchProcess,
        .INTR => error.InterruptedSystemCall,
        .IO => error.IOError,
        .NXIO => error.NoSuchDeviceOrAddress,
        .@"2BIG" => error.ArgumentListTooLong,
        .NOEXEC => error.ExecFormatError,
        .BADF => error.BadFileNumber,
        .CHILD => error.NoChildProcesses,
        .AGAIN => error.TryAgain,
        .NOMEM => error.OutOfMemory,
        .ACCES => error.PermissionDenied,
        .FAULT => error.BadAddress,
        // Declared in LinuxKernelErrorBase but previously never produced.
        .NOTBLK => error.BlockDeviceRequired,
        .BUSY => error.DeviceOrResourceBusy,
        .EXIST => error.FileExists,
        .XDEV => error.CrossDeviceLink,
        .NODEV => error.NoSuchDevice,
        .NOTDIR => error.NotADirectory,
        .ISDIR => error.IsADirectory,
        .INVAL => error.InvalidArgument,
        .NFILE => error.FileTableOverflow,
        .MFILE => error.TooManyOpenFiles,
        .NOTTY => error.NotATypewriter,
        .TXTBSY => error.TextFileBusy,
        .FBIG => error.FileTooLarge,
        .NOSPC => error.NoSpaceLeftOnDevice,
        .SPIPE => error.IllegalSeek,
        .ROFS => error.ReadOnlyFileSystem,
        .MLINK => error.TooManyLinks,
        .PIPE => error.BrokenPipe,
        .DOM => error.MathArgumentOutOfDomainOfFunc,
        .RANGE => error.MathResultNotRepresentable,
        else => |e| return os.unexpectedErrno(e),
    };
}
/// Invoke the `pivot_root` syscall with `new_root` and `put_old`.
///
/// The kernel reads both arguments as NUL-terminated strings, which a plain
/// slice does not guarantee, so both are copied into NUL-terminated buffers
/// before the call. A path too long for such a buffer is reported the same
/// way a kernel ENAMETOOLONG would be, i.e. through `valOrErr`.
pub fn pivot_root(new_root: []const u8, put_old: []const u8) PivotRootError!void {
    // Encoded the way raw syscalls report failures (-errno as usize).
    const name_too_long = 0 -% @as(usize, @enumToInt(linux.E.NAMETOOLONG));

    const new_root_z = os.toPosixPath(new_root) catch return valOrErr({}, name_too_long);
    const put_old_z = os.toPosixPath(put_old) catch return valOrErr({}, name_too_long);

    const result = linux.syscall2(.pivot_root, @ptrToInt(&new_root_z), @ptrToInt(&put_old_z));
    return valOrErr({}, result);
}
/// Return the status of the file at `path`.
///
/// Relative paths are resolved against the current working directory, and
/// the lookup is done with `AT.SYMLINK_NOFOLLOW`.
pub fn stat(path: []const u8) StatError!linux.Stat {
    var stat_result: linux.Stat = undefined;

    // https://cs.opensource.google/go/go/+/refs/tags/go1.19:src/syscall/syscall_linux_amd64.go;drc=ea9c3fd42d94182ce6f87104b68a51ea92f1a571;l=58
    const rc = fstatat(linux.AT.FDCWD, path, &stat_result, linux.AT.SYMLINK_NOFOLLOW);

    // stat_result is only meaningful when rc decodes to success; valOrErr
    // returns the error instead of the value otherwise.
    return valOrErr(stat_result, rc);
}

/// Thin wrapper around `linux.fstatat` that accepts a slice path.
///
/// Returns the raw syscall result, i.e. an encoded errno on failure.
fn fstatat(fd: os.fd_t, path: []const u8, stat_info: *linux.Stat, flags: u32) usize {
    // The kernel expects a NUL-terminated path. A slice gives no such
    // guarantee (a sub-slice such as a dirname result continues into the
    // rest of the original string), so copy it into a terminated buffer.
    const path_z = os.toPosixPath(path) catch {
        // Report an over-long path like the kernel would: -ENAMETOOLONG.
        return 0 -% @as(usize, @enumToInt(linux.E.NAMETOOLONG));
    };
    // linux.fstatat passes the descriptor by value and selects the right
    // syscall for the target architecture.
    return linux.fstatat(fd, &path_z, stat_info, flags);
}
/// Create the directory `path` together with any missing ancestors, giving
/// every directory it creates the permission bits `mode`.
///
/// Succeeds without doing anything when `path` already is a directory and
/// returns `error.PathAlreadyExists` when `path` exists but is something
/// else.
pub fn mkdirAll(path: []const u8, mode: u32) MkdirAllError!void {
    // Fast path learned from Go source
    // https://cs.opensource.google/go/go/+/refs/tags/go1.19:src/os/path.go;l=18
    // TODO(musaprg): use os.fstatat(os.AT.FDCWD, path, 0) instead
    const info = syscall.stat(path) catch |stat_err| switch (stat_err) {
        error.NoSuchFileOrDirectory => {
            // Build the ancestors first, then the directory itself.
            if (fs.path.dirname(path)) |parent_dir| try mkdirAll(parent_dir, mode);
            os.mkdir(path, mode) catch |mkdir_err| {
                // Losing a race against another creator is fine.
                if (mkdir_err != error.PathAlreadyExists) return mkdir_err;
            };
            return;
        },
        error.FileExists => return,
        error.PermissionDenied => return error.AccessDenied,
        else => return stat_err,
    };

    if (!syscall.isDir(info)) return error.PathAlreadyExists;
    log.debug("mkdirAll: path already exists: {s}", .{path});
}
/// Create a new, randomly named directory inside `dir` and return its path.
///
/// An empty `dir` selects `/tmp`. Missing ancestors of the new directory are
/// created as well. The returned path is allocated with `allocator` and is
/// owned by the caller. Returns `error.PathAlreadyExists` when no unused
/// name was found after a fixed number of attempts.
pub fn mkdirTemp(allocator: Allocator, dir: []const u8) MkdirTempError![]const u8 {
    // TODO(musaprg): avoid hard-coding
    const parent_path: []const u8 = if (dir.len == 0) "/tmp" else dir;

    // Only the ancestors go through mkdirAll. It succeeds when the directory
    // already exists, so using it for the final component could hand out a
    // directory that somebody else created.
    try mkdirAll(parent_path, 0o0700);

    var prng = rand.DefaultPrng.init(@intCast(u64, std.time.timestamp()));
    const random = prng.random();
    const max_retry = 10;
    const length = 10;

    var attempt: usize = 0;
    while (attempt < max_retry) : (attempt += 1) {
        const dir_name = try randomString(allocator, random, length);
        defer allocator.free(dir_name);
        log.debug("mkdirTemp: try to create '{s}'", .{dir_name});

        const path = try fs.path.join(allocator, &[_][]const u8{ parent_path, dir_name });
        // os.mkdir fails with PathAlreadyExists on a name collision, which
        // is what makes the retry loop meaningful.
        os.mkdir(path, 0o0700) catch |err| {
            allocator.free(path);
            switch (err) {
                error.PathAlreadyExists => continue,
                else => return err,
            }
        };
        return path;
    }
    return error.PathAlreadyExists;
}
/// Create a new, empty, randomly named file inside `dir` and return its
/// path.
///
/// An empty `dir` selects `/tmp`; otherwise `dir` has to be an absolute path
/// because the file is created with `fs.createFileAbsolute`. The returned
/// path is allocated with `allocator` and is owned by the caller. Returns
/// `error.PathAlreadyExists` when no unused name was found after a fixed
/// number of attempts.
pub fn createTempFile(allocator: Allocator, dir: []const u8) CreateTempFileError![]const u8 {
    // TODO(musaprg): avoid hard-coding
    const parent_path: []const u8 = if (dir.len == 0) "/tmp" else dir;

    var prng = rand.DefaultPrng.init(@intCast(u64, std.time.timestamp()));
    const random = prng.random();
    const max_retry = 10;
    const length = 10;

    var attempt: usize = 0;
    while (attempt < max_retry) : (attempt += 1) {
        const file_name = try randomString(allocator, random, length);
        defer allocator.free(file_name);
        log.debug("createTempFile: try to create '{s}'", .{file_name});

        const path = try fs.path.join(allocator, &[_][]const u8{ parent_path, file_name });
        // `.exclusive` makes creation fail on a name collision. Without it
        // an existing file would be opened and truncated, and the retry
        // below could never trigger.
        const file = fs.createFileAbsolute(path, .{ .exclusive = true }) catch |err| {
            allocator.free(path);
            switch (err) {
                error.PathAlreadyExists => continue,
                else => return err,
            }
        };
        file.close();
        return path;
    }
    return error.PathAlreadyExists;
}
item.* = chars[random_pos]; 184 | } 185 | 186 | return items; 187 | } 188 | 189 | test "randomString" { 190 | var arena = std.heap.ArenaAllocator.init(testing.allocator); 191 | defer arena.deinit(); 192 | var allocator = arena.allocator(); 193 | var prng = rand.DefaultPrng.init(0); 194 | const random = prng.random(); 195 | _ = try randomString(allocator, random, 10); 196 | } 197 | -------------------------------------------------------------------------------- /testdata/sample_spec_linux.json: -------------------------------------------------------------------------------- 1 | { 2 | "ociVersion": "1.0.1", 3 | "process": { 4 | "terminal": true, 5 | "user": { 6 | "uid": 1, 7 | "gid": 1, 8 | "additionalGids": [ 9 | 5, 10 | 6 11 | ] 12 | }, 13 | "args": [ 14 | "sh" 15 | ], 16 | "env": [ 17 | "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", 18 | "TERM=xterm" 19 | ], 20 | "cwd": "/", 21 | "capabilities": { 22 | "bounding": [ 23 | "CAP_AUDIT_WRITE", 24 | "CAP_KILL", 25 | "CAP_NET_BIND_SERVICE" 26 | ], 27 | "permitted": [ 28 | "CAP_AUDIT_WRITE", 29 | "CAP_KILL", 30 | "CAP_NET_BIND_SERVICE" 31 | ], 32 | "inheritable": [ 33 | "CAP_AUDIT_WRITE", 34 | "CAP_KILL", 35 | "CAP_NET_BIND_SERVICE" 36 | ], 37 | "effective": [ 38 | "CAP_AUDIT_WRITE", 39 | "CAP_KILL" 40 | ], 41 | "ambient": [ 42 | "CAP_NET_BIND_SERVICE" 43 | ] 44 | }, 45 | "rlimits": [ 46 | { 47 | "type": "RLIMIT_CORE", 48 | "hard": 1024, 49 | "soft": 1024 50 | }, 51 | { 52 | "type": "RLIMIT_NOFILE", 53 | "hard": 1024, 54 | "soft": 1024 55 | } 56 | ], 57 | "apparmorProfile": "acme_secure_profile", 58 | "oomScoreAdj": 100, 59 | "selinuxLabel": "system_u:system_r:svirt_lxc_net_t:s0:c124,c675", 60 | "noNewPrivileges": true 61 | }, 62 | "root": { 63 | "path": "rootfs", 64 | "readonly": true 65 | }, 66 | "hostname": "slartibartfast", 67 | "mounts": [ 68 | { 69 | "destination": "/proc", 70 | "type": "proc", 71 | "source": "proc" 72 | }, 73 | { 74 | "destination": "/dev", 75 | "type": "tmpfs", 76 | 
"source": "tmpfs", 77 | "options": [ 78 | "nosuid", 79 | "strictatime", 80 | "mode=755", 81 | "size=65536k" 82 | ] 83 | }, 84 | { 85 | "destination": "/dev/pts", 86 | "type": "devpts", 87 | "source": "devpts", 88 | "options": [ 89 | "nosuid", 90 | "noexec", 91 | "newinstance", 92 | "ptmxmode=0666", 93 | "mode=0620", 94 | "gid=5" 95 | ] 96 | }, 97 | { 98 | "destination": "/dev/shm", 99 | "type": "tmpfs", 100 | "source": "shm", 101 | "options": [ 102 | "nosuid", 103 | "noexec", 104 | "nodev", 105 | "mode=1777", 106 | "size=65536k" 107 | ] 108 | }, 109 | { 110 | "destination": "/dev/mqueue", 111 | "type": "mqueue", 112 | "source": "mqueue", 113 | "options": [ 114 | "nosuid", 115 | "noexec", 116 | "nodev" 117 | ] 118 | }, 119 | { 120 | "destination": "/sys", 121 | "type": "sysfs", 122 | "source": "sysfs", 123 | "options": [ 124 | "nosuid", 125 | "noexec", 126 | "nodev" 127 | ] 128 | }, 129 | { 130 | "destination": "/sys/fs/cgroup", 131 | "type": "cgroup", 132 | "source": "cgroup", 133 | "options": [ 134 | "nosuid", 135 | "noexec", 136 | "nodev", 137 | "relatime", 138 | "ro" 139 | ] 140 | } 141 | ], 142 | "hooks": { 143 | "prestart": [ 144 | { 145 | "path": "/usr/bin/fix-mounts", 146 | "args": [ 147 | "fix-mounts", 148 | "arg1", 149 | "arg2" 150 | ], 151 | "env": [ 152 | "key1=value1" 153 | ] 154 | }, 155 | { 156 | "path": "/usr/bin/setup-network" 157 | } 158 | ], 159 | "poststart": [ 160 | { 161 | "path": "/usr/bin/notify-start", 162 | "timeout": 5 163 | } 164 | ], 165 | "poststop": [ 166 | { 167 | "path": "/usr/sbin/cleanup.sh", 168 | "args": [ 169 | "cleanup.sh", 170 | "-f" 171 | ] 172 | } 173 | ] 174 | }, 175 | "linux": { 176 | "devices": [ 177 | { 178 | "path": "/dev/fuse", 179 | "type": "c", 180 | "major": 10, 181 | "minor": 229, 182 | "fileMode": 438, 183 | "uid": 0, 184 | "gid": 0 185 | }, 186 | { 187 | "path": "/dev/sda", 188 | "type": "b", 189 | "major": 8, 190 | "minor": 0, 191 | "fileMode": 432, 192 | "uid": 0, 193 | "gid": 0 194 | } 195 | ], 196 | 
"uidMappings": [ 197 | { 198 | "containerID": 0, 199 | "hostID": 1000, 200 | "size": 32000 201 | } 202 | ], 203 | "gidMappings": [ 204 | { 205 | "containerID": 0, 206 | "hostID": 1000, 207 | "size": 32000 208 | } 209 | ], 210 | "sysctl": { 211 | "net.ipv4.ip_forward": "1", 212 | "net.core.somaxconn": "256" 213 | }, 214 | "cgroupsPath": "/myRuntime/myContainer", 215 | "resources": { 216 | "network": { 217 | "classID": 1048577, 218 | "priorities": [ 219 | { 220 | "name": "eth0", 221 | "priority": 500 222 | }, 223 | { 224 | "name": "eth1", 225 | "priority": 1000 226 | } 227 | ] 228 | }, 229 | "pids": { 230 | "limit": 32771 231 | }, 232 | "hugepageLimits": [ 233 | { 234 | "pageSize": "2MB", 235 | "limit": 9223372036854772000 236 | }, 237 | { 238 | "pageSize": "64KB", 239 | "limit": 1000000 240 | } 241 | ], 242 | "memory": { 243 | "limit": 536870912, 244 | "reservation": 536870912, 245 | "swap": 536870912, 246 | "kernel": -1, 247 | "kernelTCP": -1, 248 | "swappiness": 0, 249 | "disableOOMKiller": false 250 | }, 251 | "cpu": { 252 | "shares": 1024, 253 | "quota": 1000000, 254 | "period": 500000, 255 | "realtimeRuntime": 950000, 256 | "realtimePeriod": 1000000, 257 | "cpus": "2-3", 258 | "idle": 1, 259 | "mems": "0-7" 260 | }, 261 | "devices": [ 262 | { 263 | "allow": false, 264 | "access": "rwm" 265 | }, 266 | { 267 | "allow": true, 268 | "type": "c", 269 | "major": 10, 270 | "minor": 229, 271 | "access": "rw" 272 | }, 273 | { 274 | "allow": true, 275 | "type": "b", 276 | "major": 8, 277 | "minor": 0, 278 | "access": "r" 279 | } 280 | ], 281 | "blockIO": { 282 | "weight": 10, 283 | "leafWeight": 10, 284 | "weightDevice": [ 285 | { 286 | "major": 8, 287 | "minor": 0, 288 | "weight": 500, 289 | "leafWeight": 300 290 | }, 291 | { 292 | "major": 8, 293 | "minor": 16, 294 | "weight": 500 295 | } 296 | ], 297 | "throttleReadBpsDevice": [ 298 | { 299 | "major": 8, 300 | "minor": 0, 301 | "rate": 600 302 | } 303 | ], 304 | "throttleWriteIOPSDevice": [ 305 | { 306 | "major": 8, 
307 | "minor": 16, 308 | "rate": 300 309 | } 310 | ] 311 | } 312 | }, 313 | "rootfsPropagation": "slave", 314 | "seccomp": { 315 | "defaultAction": "SCMP_ACT_ALLOW", 316 | "architectures": [ 317 | "SCMP_ARCH_X86", 318 | "SCMP_ARCH_X32" 319 | ], 320 | "syscalls": [ 321 | { 322 | "names": [ 323 | "getcwd", 324 | "chmod" 325 | ], 326 | "action": "SCMP_ACT_ERRNO" 327 | } 328 | ] 329 | }, 330 | "namespaces": [ 331 | { 332 | "type": "pid" 333 | }, 334 | { 335 | "type": "network" 336 | }, 337 | { 338 | "type": "ipc" 339 | }, 340 | { 341 | "type": "uts" 342 | }, 343 | { 344 | "type": "mount" 345 | }, 346 | { 347 | "type": "user" 348 | }, 349 | { 350 | "type": "cgroup" 351 | } 352 | ], 353 | "maskedPaths": [ 354 | "/proc/kcore", 355 | "/proc/latency_stats", 356 | "/proc/timer_stats", 357 | "/proc/sched_debug" 358 | ], 359 | "readonlyPaths": [ 360 | "/proc/asound", 361 | "/proc/bus", 362 | "/proc/fs", 363 | "/proc/irq", 364 | "/proc/sys", 365 | "/proc/sysrq-trigger" 366 | ], 367 | "mountLabel": "system_u:object_r:svirt_sandbox_file_t:s0:c715,c811" 368 | }, 369 | "annotations": { 370 | "com.example.key1": "value1", 371 | "com.example.key2": "value2" 372 | } 373 | } -------------------------------------------------------------------------------- /testdata/state.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": "bd63ddfb3fda11986b6caa3a85aa6ac6a7def43e0d3298956e3891b91804a1af", 3 | "init_process_pid": 393, 4 | "init_process_start": 3164, 5 | "created": "2022-09-18T06:36:31.3214015Z", 6 | "rootless": false, 7 | "cgroup_paths": { 8 | "": "/sys/fs/cgroup/unified/runzigc/bd63ddfb3fda11986b6caa3a85aa6ac6a7def43e0d3298956e3891b91804a1af", 9 | "blkio": "/sys/fs/cgroup/blkio/runzigc/bd63ddfb3fda11986b6caa3a85aa6ac6a7def43e0d3298956e3891b91804a1af", 10 | "cpu": "/sys/fs/cgroup/cpu/runzigc/bd63ddfb3fda11986b6caa3a85aa6ac6a7def43e0d3298956e3891b91804a1af", 11 | "cpuacct": 
"/sys/fs/cgroup/cpuacct/runzigc/bd63ddfb3fda11986b6caa3a85aa6ac6a7def43e0d3298956e3891b91804a1af", 12 | "cpuset": "/sys/fs/cgroup/cpuset/runzigc/bd63ddfb3fda11986b6caa3a85aa6ac6a7def43e0d3298956e3891b91804a1af", 13 | "devices": "/sys/fs/cgroup/devices/runzigc/bd63ddfb3fda11986b6caa3a85aa6ac6a7def43e0d3298956e3891b91804a1af", 14 | "freezer": "/sys/fs/cgroup/freezer/runzigc/bd63ddfb3fda11986b6caa3a85aa6ac6a7def43e0d3298956e3891b91804a1af", 15 | "hugetlb": "/sys/fs/cgroup/hugetlb/runzigc/bd63ddfb3fda11986b6caa3a85aa6ac6a7def43e0d3298956e3891b91804a1af", 16 | "memory": "/sys/fs/cgroup/memory/runzigc/bd63ddfb3fda11986b6caa3a85aa6ac6a7def43e0d3298956e3891b91804a1af", 17 | "net_cls": "/sys/fs/cgroup/net_cls/runzigc/bd63ddfb3fda11986b6caa3a85aa6ac6a7def43e0d3298956e3891b91804a1af", 18 | "net_prio": "/sys/fs/cgroup/net_prio/runzigc/bd63ddfb3fda11986b6caa3a85aa6ac6a7def43e0d3298956e3891b91804a1af", 19 | "perf_event": "/sys/fs/cgroup/perf_event/runzigc/bd63ddfb3fda11986b6caa3a85aa6ac6a7def43e0d3298956e3891b91804a1af", 20 | "pids": "/sys/fs/cgroup/pids/runzigc/bd63ddfb3fda11986b6caa3a85aa6ac6a7def43e0d3298956e3891b91804a1af", 21 | "rdma": "/sys/fs/cgroup/rdma/runzigc/bd63ddfb3fda11986b6caa3a85aa6ac6a7def43e0d3298956e3891b91804a1af" 22 | }, 23 | "namespace_paths": { 24 | "NEWCGROUP": "/proc/393/ns/cgroup", 25 | "NEWIPC": "/proc/393/ns/ipc", 26 | "NEWNET": "/proc/393/ns/net", 27 | "NEWNS": "/proc/393/ns/mnt", 28 | "NEWPID": "/proc/393/ns/pid", 29 | "NEWUSER": "/proc/393/ns/user", 30 | "NEWUTS": "/proc/393/ns/uts" 31 | }, 32 | "external_descriptors": [ 33 | "/dev/null", 34 | "pipe:[32782]", 35 | "pipe:[32783]" 36 | ], 37 | "config": { 38 | "ociVersion": "1.0.1", 39 | "process": { 40 | "terminal": true, 41 | "user": { 42 | "uid": 1, 43 | "gid": 1, 44 | "additionalGids": [ 45 | 5, 46 | 6 47 | ] 48 | }, 49 | "args": [ 50 | "sh" 51 | ], 52 | "env": [ 53 | "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", 54 | "TERM=xterm" 55 | ], 56 | "cwd": "/", 57 | 
"capabilities": { 58 | "bounding": [ 59 | "CAP_AUDIT_WRITE", 60 | "CAP_KILL", 61 | "CAP_NET_BIND_SERVICE" 62 | ], 63 | "permitted": [ 64 | "CAP_AUDIT_WRITE", 65 | "CAP_KILL", 66 | "CAP_NET_BIND_SERVICE" 67 | ], 68 | "inheritable": [ 69 | "CAP_AUDIT_WRITE", 70 | "CAP_KILL", 71 | "CAP_NET_BIND_SERVICE" 72 | ], 73 | "effective": [ 74 | "CAP_AUDIT_WRITE", 75 | "CAP_KILL" 76 | ], 77 | "ambient": [ 78 | "CAP_NET_BIND_SERVICE" 79 | ] 80 | }, 81 | "rlimits": [ 82 | { 83 | "type": "RLIMIT_CORE", 84 | "hard": 1024, 85 | "soft": 1024 86 | }, 87 | { 88 | "type": "RLIMIT_NOFILE", 89 | "hard": 1024, 90 | "soft": 1024 91 | } 92 | ], 93 | "apparmorProfile": "acme_secure_profile", 94 | "oomScoreAdj": 100, 95 | "selinuxLabel": "system_u:system_r:svirt_lxc_net_t:s0:c124,c675", 96 | "noNewPrivileges": true 97 | }, 98 | "root": { 99 | "path": "rootfs", 100 | "readonly": true 101 | }, 102 | "hostname": "slartibartfast", 103 | "mounts": [ 104 | { 105 | "destination": "/proc", 106 | "type": "proc", 107 | "source": "proc" 108 | }, 109 | { 110 | "destination": "/dev", 111 | "type": "tmpfs", 112 | "source": "tmpfs", 113 | "options": [ 114 | "nosuid", 115 | "strictatime", 116 | "mode=755", 117 | "size=65536k" 118 | ] 119 | }, 120 | { 121 | "destination": "/dev/pts", 122 | "type": "devpts", 123 | "source": "devpts", 124 | "options": [ 125 | "nosuid", 126 | "noexec", 127 | "newinstance", 128 | "ptmxmode=0666", 129 | "mode=0620", 130 | "gid=5" 131 | ] 132 | }, 133 | { 134 | "destination": "/dev/shm", 135 | "type": "tmpfs", 136 | "source": "shm", 137 | "options": [ 138 | "nosuid", 139 | "noexec", 140 | "nodev", 141 | "mode=1777", 142 | "size=65536k" 143 | ] 144 | }, 145 | { 146 | "destination": "/dev/mqueue", 147 | "type": "mqueue", 148 | "source": "mqueue", 149 | "options": [ 150 | "nosuid", 151 | "noexec", 152 | "nodev" 153 | ] 154 | }, 155 | { 156 | "destination": "/sys", 157 | "type": "sysfs", 158 | "source": "sysfs", 159 | "options": [ 160 | "nosuid", 161 | "noexec", 162 | "nodev" 163 | ] 
164 | }, 165 | { 166 | "destination": "/sys/fs/cgroup", 167 | "type": "cgroup", 168 | "source": "cgroup", 169 | "options": [ 170 | "nosuid", 171 | "noexec", 172 | "nodev", 173 | "relatime", 174 | "ro" 175 | ] 176 | } 177 | ], 178 | "hooks": { 179 | "prestart": [ 180 | { 181 | "path": "/usr/bin/fix-mounts", 182 | "args": [ 183 | "fix-mounts", 184 | "arg1", 185 | "arg2" 186 | ], 187 | "env": [ 188 | "key1=value1" 189 | ] 190 | }, 191 | { 192 | "path": "/usr/bin/setup-network" 193 | } 194 | ], 195 | "poststart": [ 196 | { 197 | "path": "/usr/bin/notify-start", 198 | "timeout": 5 199 | } 200 | ], 201 | "poststop": [ 202 | { 203 | "path": "/usr/sbin/cleanup.sh", 204 | "args": [ 205 | "cleanup.sh", 206 | "-f" 207 | ] 208 | } 209 | ] 210 | }, 211 | "linux": { 212 | "devices": [ 213 | { 214 | "path": "/dev/fuse", 215 | "type": "c", 216 | "major": 10, 217 | "minor": 229, 218 | "fileMode": 438, 219 | "uid": 0, 220 | "gid": 0 221 | }, 222 | { 223 | "path": "/dev/sda", 224 | "type": "b", 225 | "major": 8, 226 | "minor": 0, 227 | "fileMode": 432, 228 | "uid": 0, 229 | "gid": 0 230 | } 231 | ], 232 | "uidMappings": [ 233 | { 234 | "containerID": 0, 235 | "hostID": 1000, 236 | "size": 32000 237 | } 238 | ], 239 | "gidMappings": [ 240 | { 241 | "containerID": 0, 242 | "hostID": 1000, 243 | "size": 32000 244 | } 245 | ], 246 | "sysctl": { 247 | "net.ipv4.ip_forward": "1", 248 | "net.core.somaxconn": "256" 249 | }, 250 | "cgroupsPath": "/myRuntime/myContainer", 251 | "resources": { 252 | "network": { 253 | "classID": 1048577, 254 | "priorities": [ 255 | { 256 | "name": "eth0", 257 | "priority": 500 258 | }, 259 | { 260 | "name": "eth1", 261 | "priority": 1000 262 | } 263 | ] 264 | }, 265 | "pids": { 266 | "limit": 32771 267 | }, 268 | "hugepageLimits": [ 269 | { 270 | "pageSize": "2MB", 271 | "limit": 9223372036854772000 272 | }, 273 | { 274 | "pageSize": "64KB", 275 | "limit": 1000000 276 | } 277 | ], 278 | "memory": { 279 | "limit": 536870912, 280 | "reservation": 536870912, 281 | 
"swap": 536870912, 282 | "kernel": -1, 283 | "kernelTCP": -1, 284 | "swappiness": 0, 285 | "disableOOMKiller": false 286 | }, 287 | "cpu": { 288 | "shares": 1024, 289 | "quota": 1000000, 290 | "period": 500000, 291 | "realtimeRuntime": 950000, 292 | "realtimePeriod": 1000000, 293 | "cpus": "2-3", 294 | "idle": 1, 295 | "mems": "0-7" 296 | }, 297 | "devices": [ 298 | { 299 | "allow": false, 300 | "access": "rwm" 301 | }, 302 | { 303 | "allow": true, 304 | "type": "c", 305 | "major": 10, 306 | "minor": 229, 307 | "access": "rw" 308 | }, 309 | { 310 | "allow": true, 311 | "type": "b", 312 | "major": 8, 313 | "minor": 0, 314 | "access": "r" 315 | } 316 | ], 317 | "blockIO": { 318 | "weight": 10, 319 | "leafWeight": 10, 320 | "weightDevice": [ 321 | { 322 | "major": 8, 323 | "minor": 0, 324 | "weight": 500, 325 | "leafWeight": 300 326 | }, 327 | { 328 | "major": 8, 329 | "minor": 16, 330 | "weight": 500 331 | } 332 | ], 333 | "throttleReadBpsDevice": [ 334 | { 335 | "major": 8, 336 | "minor": 0, 337 | "rate": 600 338 | } 339 | ], 340 | "throttleWriteIOPSDevice": [ 341 | { 342 | "major": 8, 343 | "minor": 16, 344 | "rate": 300 345 | } 346 | ] 347 | } 348 | }, 349 | "rootfsPropagation": "slave", 350 | "seccomp": { 351 | "defaultAction": "SCMP_ACT_ALLOW", 352 | "architectures": [ 353 | "SCMP_ARCH_X86", 354 | "SCMP_ARCH_X32" 355 | ], 356 | "syscalls": [ 357 | { 358 | "names": [ 359 | "getcwd", 360 | "chmod" 361 | ], 362 | "action": "SCMP_ACT_ERRNO" 363 | } 364 | ] 365 | }, 366 | "namespaces": [ 367 | { 368 | "type": "pid" 369 | }, 370 | { 371 | "type": "network" 372 | }, 373 | { 374 | "type": "ipc" 375 | }, 376 | { 377 | "type": "uts" 378 | }, 379 | { 380 | "type": "mount" 381 | }, 382 | { 383 | "type": "user" 384 | }, 385 | { 386 | "type": "cgroup" 387 | } 388 | ], 389 | "maskedPaths": [ 390 | "/proc/kcore", 391 | "/proc/latency_stats", 392 | "/proc/timer_stats", 393 | "/proc/sched_debug" 394 | ], 395 | "readonlyPaths": [ 396 | "/proc/asound", 397 | "/proc/bus", 398 | 
"/proc/fs", 399 | "/proc/irq", 400 | "/proc/sys", 401 | "/proc/sysrq-trigger" 402 | ], 403 | "mountLabel": "system_u:object_r:svirt_sandbox_file_t:s0:c715,c811" 404 | }, 405 | "annotations": { 406 | "com.example.key1": "value1", 407 | "com.example.key2": "value2" 408 | } 409 | } 410 | } --------------------------------------------------------------------------------