├── .gitignore ├── .gitmodules ├── CONTRIBUTING.md ├── Cargo.toml ├── LICENSE.txt ├── README.md ├── build.rs ├── build.sh ├── build_seccomp.sh ├── oci ├── .gitignore ├── Cargo.toml └── src │ ├── lib.rs │ └── serialize.rs ├── railcar.png ├── rustfmt.toml ├── src ├── capabilities.rs ├── cgroups.rs ├── errors.rs ├── logger.rs ├── main.rs ├── mounts.rs ├── nix_ext.rs ├── seccomp.rs ├── selinux.rs ├── signals.rs └── sync.rs └── wercker.yml /.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | Cargo.lock 3 | railcar 4 | .wercker 5 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "libseccomp"] 2 | path = libseccomp 3 | url = https://github.com/seccomp/libseccomp 4 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Railcar # 2 | 3 | *Copyright (c) 2017, Oracle and/or its affiliates. All rights reserved.* 4 | 5 | Pull requests can be made under 6 | [The Oracle Contributor Agreement](https://www.oracle.com/technetwork/community/oca-486395.html) (OCA). 7 | 8 | For pull requests to be accepted, the bottom of 9 | your commit message must have the following line using your name and 10 | e-mail address as it appears in the OCA Signatories list. 11 | 12 | ``` 13 | Signed-off-by: Your Name <you@example.org> 14 | ``` 15 | 16 | This can be automatically added to pull requests by committing with: 17 | 18 | ``` 19 | git commit --signoff 20 | ``` 21 | 22 | Only pull requests from committers that can be verified as having 23 | signed the OCA can be accepted. 
24 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "railcar" 3 | version = "1.0.4" 4 | authors = ["Vishvananda Ishaya Abrams "] 5 | build = "build.rs" 6 | 7 | [dependencies] 8 | caps = "0.2.0" 9 | clap = "2.24.1" 10 | error-chain = "0.10.0" 11 | lazy_static = "^1.1.0" 12 | libc = "0.2.21" 13 | log = {version = "0.4.3", features = ["release_max_level_info"] } 14 | nix = "0.11.0" 15 | num-traits = "^0.2.5" 16 | oci = { path = "oci" } 17 | prctl = "1.0.0" 18 | scopeguard = "^0.3.3" 19 | seccomp-sys = "0.1.2" 20 | 21 | [features] 22 | nightly = [] 23 | 24 | [profile.release] 25 | lto = true 26 | panic = 'abort' 27 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017, Oracle and/or its affiliates. All rights reserved. 2 | 3 | This software is dual-licensed to you under the Universal Permissive License (UPL) or the Apache License 2.0 or both. See below for license terms. 4 | ____________________________ 5 | The Universal Permissive License (UPL), Version 1.0 6 | Copyright (c) 2017, Oracle and/or its affiliates. All rights reserved. 
7 | 8 | Subject to the condition set forth below, permission is hereby granted to any person obtaining a copy of this software, associated documentation and/or data (collectively the "Software"), free of charge and under any and all copyright rights in the Software, and any and all patent rights owned or freely licensable by each licensor hereunder covering either (i) the unmodified Software as contributed to or provided by such licensor, or (ii) the Larger Works (as defined below), to deal in both 9 | 10 | (a) the Software, and 11 | (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if one is included with the Software (each a "Larger Work" to which the Software is contributed by such licensors), 12 | 13 | without restriction, including without limitation the rights to copy, create derivative works of, display, perform, and distribute the Software and make, use, sell, offer for sale, import, export, have made, and have sold the Software and the Larger Work(s), and to sublicense the foregoing rights on either these or other terms. 14 | 15 | This license is subject to the following condition: 16 | 17 | The above copyright notice and either this complete permission notice or at a minimum a reference to the UPL must be included in all copies or substantial portions of the Software. 18 | 19 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | 21 | The Apache Software License, Version 2.0 22 | Copyright (c) 2016, 2017, Oracle and/or its affiliates. All rights reserved. 
23 | 24 | Licensed under the Apache License, Version 2.0 (the "License"); You may not use this product except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. A copy of the license is also reproduced below. Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 25 | 26 | Apache License 27 | 28 | Version 2.0, January 2004 29 | 30 | http://www.apache.org/licenses/ 31 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 32 | 1. Definitions. 33 | "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. 34 | "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. 35 | "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. 36 | "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. 37 | "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. 
38 | "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. 39 | "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). 40 | "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. 41 | "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." 
42 | "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 43 | 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 44 | 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 45 | 4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: 46 | You must give any other recipients of the Work or Derivative Works a copy of this License; and 47 | You must cause any modified files to carry prominent notices stating that You changed the files; and 48 | You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and 49 | If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. 50 | 51 | You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 
52 | 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 53 | 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 54 | 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 55 | 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 56 | 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. 57 | END OF TERMS AND CONDITIONS 58 | 59 | APPENDIX: How to apply the Apache License to your work. 60 | 61 | To apply the Apache License to your work, attach the following 62 | boilerplate notice, with the fields enclosed by brackets "[]" 63 | replaced with your own identifying information. (Don't include 64 | the brackets!) The text should be enclosed in the appropriate 65 | comment syntax for the file format. We also recommend that a 66 | file or class name and description of purpose be included on the 67 | same "printed page" as the copyright notice for easier 68 | identification within third-party archives. 
69 | 70 | Copyright [yyyy] [name of copyright owner] 71 | 72 | Licensed under the Apache License, Version 2.0 (the "License"); 73 | you may not use this file except in compliance with the License. 74 | You may obtain a copy of the License at 75 | 76 | http://www.apache.org/licenses/LICENSE-2.0 77 | 78 | Unless required by applicable law or agreed to in writing, software 79 | distributed under the License is distributed on an "AS IS" BASIS, 80 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 81 | See the License for the specific language governing permissions and 82 | limitations under the License. 83 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # `railcar` - rust implementation of the oci-runtime spec # 2 | 3 | ![railcar](https://github.com/oracle/railcar/raw/master/railcar.png 4 | "railcar") 5 | 6 | ## What is `railcar`? ## 7 | 8 | `railcar` is a rust implementation of the [opencontainers 9 | initiative](https://www.opencontainers.org/)'s [runtime 10 | spec](https://github.com/opencontainers/runtime-spec). It is similar to the 11 | reference implementation `runc`, but it is implemented completely in rust for 12 | memory safety without needing the overhead of a garbage collector or multiple 13 | threads. 
For more information on the development of railcar, check out 14 | [Building a Container Runtime in 15 | Rust](https://blogs.oracle.com/developers/building-a-container-runtime-in-rust) 16 | 17 | ## Building `railcar` ## 18 | 19 | [![wercker status](https://app.wercker.com/status/730e874772dc02c6005f4ae4e42b0ca4/s/master "wercker status")](https://app.wercker.com/project/byKey/730e874772dc02c6005f4ae4e42b0ca4) 20 | 21 | Install rust: 22 | 23 | curl https://sh.rustup.rs -sSf | sh 24 | cargo install cargo-when 25 | rustup toolchain install stable-x86_64-unknown-linux-gnu 26 | rustup default stable-x86_64-unknown-linux-gnu # for stable 27 | rustup target install x86_64-unknown-linux-musl # for stable 28 | rustup toolchain install nightly-x86_64-unknown-linux-gnu 29 | rustup default nightly-x86_64-unknown-linux-gnu # for nightly 30 | rustup target install x86_64-unknown-linux-musl # for nightly 31 | 32 | Building can be done via build.sh: 33 | 34 | build.sh 35 | 36 | By default, build.sh builds a dynamic binary using gnu. To build a static 37 | binary, set `TARGET` to `x86_64-unknown-linux-musl`: 38 | 39 | TARGET=x86_64-unknown-linux-musl ./build.sh 40 | 41 | Build requirements for TARGET=x86_64-unknown-linux-gnu: 42 | 43 | libseccomp-devel 44 | 45 | Build requirements for TARGET=x86_64-unknown-linux-musl: 46 | 47 | git submodule update --init 48 | autotools 49 | make 50 | gcc 51 | musl-gcc 52 | 53 | To build a release version: 54 | 55 | build.sh --release 56 | 57 | If you build using stable instead of nightly, the set_name feature will be 58 | disabled and the init process inside the container will not be named rc-init 59 | when viewed via ps or /proc/$pid/cmdline. 60 | 61 | ## Using `railcar` ## 62 | 63 | ./railcar run 64 | 65 | You can specify a different bundle directory where your config.json is 66 | located with -b: 67 | 68 | ./railcar -b /some/other/directory run 69 | 70 | ## Using `railcar` with docker ## 71 | 72 | `railcar` can be used as a backend for docker. 
To use it, start the docker 73 | daemon with an additional backend: 74 | 75 | dockerd ... --experimental --add-runtime "rc=/path/to/railcar" 76 | 77 | Then you can use `railcar` by specifying the `rc` backend: 78 | 79 | docker run -it --rm --runtime rc hello 80 | 81 | Note that you should start the daemon with a terminal (the -t option) so that 82 | docker can properly collect stdout and stderr from `railcar`. If you want to 83 | daemonize the container, just use: 84 | 85 | docker run -dt --rm --runtime rc hello 86 | 87 | ## Differences from `runc` ## 88 | 89 | In general, `railcar` is very similar to `runc`, but some of the `runc` 90 | commands are not supported. Currently, `railcar` does not support the following 91 | commands: 92 | 93 | checkpoint 94 | events 95 | exec 96 | init 97 | list 98 | pause 99 | restore 100 | resume 101 | spec 102 | 103 | Also, `railcar` always runs an init process separately from the container 104 | process. 105 | 106 | ## Contributing ## 107 | 108 | `railcar` is an open source project. See [CONTRIBUTING](CONTRIBUTING.md) for 109 | details. 110 | 111 | Oracle gratefully acknowledges the contributions to railcar that have been made 112 | by the community. 113 | 114 | ## Getting in touch ## 115 | 116 | The best way to get in touch is Slack. 117 | 118 | Click [here](https://join.slack.com/t/oraclecontainertools/shared_invite/enQtMzIwNzg3NDIzMzE5LTIwMjZlODllMWRmNjMwZGM1NGNjMThlZjg3ZmU3NDY1ZWU5ZGJmZWFkOTBjNzk0ODIxNzQ2ODUyNThiNmE0MmI) to join the [Oracle Container Tools workspace](https://oraclecontainertools.slack.com). 119 | 120 | Then join the [Railcar channel](https://oraclecontainertools.slack.com/messages/C8BP6MEA0). 121 | 122 | ## License ## 123 | 124 | Copyright (c) 2017, Oracle and/or its affiliates. All rights reserved. 125 | 126 | `railcar` is dual licensed under the Universal Permissive License 1.0 and the 127 | Apache License 2.0. 128 | 129 | See [LICENSE](LICENSE.txt) for more details. 
130 | -------------------------------------------------------------------------------- /build.rs: -------------------------------------------------------------------------------- 1 | use std::env; 2 | use std::fs::File; 3 | use std::io::Read; 4 | use std::process::Command; 5 | 6 | fn main() { 7 | // static link the musl target 8 | if env::var("TARGET").unwrap() == "x86_64-unknown-linux-musl" { 9 | let mut cmd = Command::new("./build_seccomp.sh"); 10 | let output = cmd.output().expect("cmd failed to start"); 11 | if !output.status.success() { 12 | println!( 13 | "failed to build libseccomp:\n{}\n{}", 14 | &std::str::from_utf8(&output.stdout).unwrap(), 15 | &std::str::from_utf8(&output.stderr).unwrap() 16 | ); 17 | let mut f = File::open("libseccomp/config.log").unwrap(); 18 | let mut result = String::new(); 19 | f.read_to_string(&mut result).unwrap(); 20 | println!{"{}", &result}; 21 | std::process::exit(1); 22 | } 23 | 24 | let pwd = std::env::var("PWD").unwrap(); 25 | let dir = format!("{}/libseccomp/src/.libs", pwd); 26 | println!("cargo:rustc-link-search=native={}", dir); 27 | println!("cargo:rustc-link-lib=static=seccomp"); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | TARGET=${TARGET-x86_64-unknown-linux-gnu} 3 | if [ "$TARGET" != "" ]; then 4 | TGT="--target $TARGET" 5 | fi 6 | VERSION=debug 7 | if [[ "$1" == "--release" ]]; then 8 | VERSION=release 9 | fi 10 | cargo when --channel=stable build --verbose $TGT $1 && \ 11 | cargo when --channel=beta build --verbose $TGT $1 && \ 12 | cargo when --channel=nightly build --verbose --features nightly $TGT $1 && \ 13 | rm -f railcar 14 | cp target/$TARGET/$VERSION/railcar . 
15 | -------------------------------------------------------------------------------- /build_seccomp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | function die { 6 | echo $1 7 | exit 1 8 | } 9 | 10 | [ -e libseccomp/autogen.sh ] || die "libseccomp empty, did you git submodule update --init" 11 | cd libseccomp 12 | [ -e ./configure ] || ./autogen.sh || die "autogen failed, check dependencies" 13 | [ -e ./Makefile ] || CC="musl-gcc -pie -fPIC" CPPFLAGS="-idirafter/usr/include/x86_64-linux-gnu -idirafter/usr/include" ./configure --enable-static 14 | make || die "make failed, check dependencies" 15 | echo "$PWD/src/.libs" 16 | -------------------------------------------------------------------------------- /oci/.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | Cargo.lock 3 | target 4 | Cargo.lock 5 | -------------------------------------------------------------------------------- /oci/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "oci" 3 | version = "0.1.0" 4 | authors = ["Vishvananda Ishaya Abrams "] 5 | 6 | [dependencies] 7 | serde = "0.9" 8 | serde_json = "0.9" 9 | serde_derive = "0.9" 10 | -------------------------------------------------------------------------------- /oci/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![allow(non_camel_case_types)] 2 | #[macro_use] 3 | extern crate serde_derive; 4 | extern crate serde_json; 5 | extern crate serde; 6 | //extern crate nix; 7 | 8 | pub mod serialize; 9 | 10 | use std::collections::HashMap; 11 | use std::io::Write; 12 | 13 | use serde_json::Value; 14 | 15 | //use nix::unistd::{Gid,Pid,Uid}; 16 | 17 | fn is_false(b: &bool) -> bool { 18 | !b 19 | } 20 | 21 | #[derive(Serialize, Deserialize, Debug)] 22 | pub struct Platform { 23 | #[serde(default, 
skip_serializing_if = "String::is_empty")] 24 | pub os: String, 25 | #[serde(default, skip_serializing_if = "String::is_empty")] 26 | pub arch: String, 27 | } 28 | 29 | #[derive(Default, PartialEq, Serialize, Deserialize, Debug)] 30 | pub struct Box { 31 | #[serde(default)] 32 | pub height: u64, 33 | #[serde(default)] 34 | pub width: u64, 35 | } 36 | 37 | fn is_default(b: &T) -> bool { 38 | *b == T::default() 39 | } 40 | 41 | 42 | #[derive(Serialize, Deserialize, Debug)] 43 | pub struct User { 44 | #[serde(default)] 45 | pub uid: u32, 46 | #[serde(default)] 47 | pub gid: u32, 48 | #[serde(default, skip_serializing_if = "Vec::is_empty", 49 | rename = "additionalGids")] 50 | pub additional_gids: Vec, 51 | #[serde(default, skip_serializing_if = "String::is_empty")] 52 | pub username: String, 53 | } 54 | 55 | // this converts directly to the correct int 56 | #[derive(Serialize, Deserialize, Debug, Clone, Copy)] 57 | pub enum LinuxRlimitType { 58 | RLIMIT_CPU, // CPU time in sec 59 | RLIMIT_FSIZE, // Maximum filesize 60 | RLIMIT_DATA, // max data size 61 | RLIMIT_STACK, // max stack size 62 | RLIMIT_CORE, // max core file size 63 | RLIMIT_RSS, // max resident set size 64 | RLIMIT_NPROC, // max number of processes 65 | RLIMIT_NOFILE, // max number of open files 66 | RLIMIT_MEMLOCK, // max locked-in-memory address space 67 | RLIMIT_AS, // address space limit 68 | RLIMIT_LOCKS, // maximum file locks held 69 | RLIMIT_SIGPENDING, // max number of pending signals 70 | RLIMIT_MSGQUEUE, // maximum bytes in POSIX mqueues 71 | RLIMIT_NICE, // max nice prio allowed to raise to 72 | RLIMIT_RTPRIO, // maximum realtime priority 73 | RLIMIT_RTTIME, // timeout for RT tasks in us 74 | } 75 | 76 | #[derive(Serialize, Deserialize, Debug)] 77 | pub struct LinuxRlimit { 78 | #[serde(rename = "type")] 79 | pub typ: LinuxRlimitType, 80 | #[serde(default)] 81 | pub hard: u64, 82 | #[serde(default)] 83 | pub soft: u64, 84 | } 85 | 86 | #[derive(Serialize, Deserialize, Debug, Clone, Copy)] 87 | 
#[repr(u8)] 88 | pub enum LinuxCapabilityType { 89 | CAP_CHOWN, 90 | CAP_DAC_OVERRIDE, 91 | CAP_DAC_READ_SEARCH, 92 | CAP_FOWNER, 93 | CAP_FSETID, 94 | CAP_KILL, 95 | CAP_SETGID, 96 | CAP_SETUID, 97 | CAP_SETPCAP, 98 | CAP_LINUX_IMMUTABLE, 99 | CAP_NET_BIND_SERVICE, 100 | CAP_NET_BROADCAST, 101 | CAP_NET_ADMIN, 102 | CAP_NET_RAW, 103 | CAP_IPC_LOCK, 104 | CAP_IPC_OWNER, 105 | CAP_SYS_MODULE, 106 | CAP_SYS_RAWIO, 107 | CAP_SYS_CHROOT, 108 | CAP_SYS_PTRACE, 109 | CAP_SYS_PACCT, 110 | CAP_SYS_ADMIN, 111 | CAP_SYS_BOOT, 112 | CAP_SYS_NICE, 113 | CAP_SYS_RESOURCE, 114 | CAP_SYS_TIME, 115 | CAP_SYS_TTY_CONFIG, 116 | CAP_MKNOD, 117 | CAP_LEASE, 118 | CAP_AUDIT_WRITE, 119 | CAP_AUDIT_CONTROL, 120 | CAP_SETFCAP, 121 | CAP_MAC_OVERRIDE, 122 | CAP_MAC_ADMIN, 123 | CAP_SYSLOG, 124 | CAP_WAKE_ALARM, 125 | CAP_BLOCK_SUSPEND, 126 | CAP_AUDIT_READ, 127 | } 128 | 129 | #[derive(Serialize, Deserialize, Debug)] 130 | pub struct LinuxCapabilities { 131 | #[serde(default, skip_serializing_if = "Vec::is_empty")] 132 | pub bounding: Vec, 133 | #[serde(default, skip_serializing_if = "Vec::is_empty")] 134 | pub effective: Vec, 135 | #[serde(default, skip_serializing_if = "Vec::is_empty")] 136 | pub inheritable: Vec, 137 | #[serde(default, skip_serializing_if = "Vec::is_empty")] 138 | pub permitted: Vec, 139 | #[serde(default, skip_serializing_if = "Vec::is_empty")] 140 | pub ambient: Vec, 141 | } 142 | 143 | #[derive(Serialize, Deserialize, Debug)] 144 | pub struct Process { 145 | #[serde(default, skip_serializing_if = "is_false")] 146 | pub terminal: bool, 147 | #[serde(default, skip_serializing_if = "is_default", 148 | rename = "consoleSize")] 149 | pub console_size: Box, 150 | pub user: User, 151 | pub args: Vec, 152 | #[serde(default, skip_serializing_if = "Vec::is_empty")] 153 | pub env: Vec, 154 | #[serde(default, skip_serializing_if = "String::is_empty")] 155 | pub cwd: String, 156 | #[serde(default, deserialize_with = "deserialize_capabilities", 157 | skip_serializing_if = 
"Option::is_none")] 158 | pub capabilities: Option, 159 | #[serde(default, skip_serializing_if = "Vec::is_empty")] 160 | pub rlimits: Vec, 161 | #[serde(default, skip_serializing_if = "is_false", 162 | rename = "noNewPrivileges")] 163 | pub no_new_privileges: bool, 164 | #[serde(default, skip_serializing_if = "String::is_empty", 165 | rename = "apparmorProfile")] 166 | pub apparmor_profile: String, 167 | #[serde(default, skip_serializing_if = "String::is_empty", 168 | rename = "selinuxLabel")] 169 | pub selinux_label: String, 170 | } 171 | 172 | use serde::Deserialize; 173 | 174 | fn cap_from_array( 175 | a: &[serde_json::Value], 176 | ) -> Result, D::Error> 177 | where 178 | D: serde::Deserializer, 179 | { 180 | let mut caps = Vec::new(); 181 | for c in a { 182 | match LinuxCapabilityType::deserialize(c) { 183 | Ok(val) => caps.push(val), 184 | Err(_) => { 185 | let msg = format!("Capability '{}' is not valid", c); 186 | return Err(serde::de::Error::custom(msg)); 187 | } 188 | } 189 | } 190 | Ok(caps) 191 | } 192 | 193 | fn cap_from_object( 194 | o: &serde_json::Map, 195 | key: &str, 196 | ) -> Result, D::Error> 197 | where 198 | D: serde::Deserializer, 199 | { 200 | if let Some(v) = o.get(key) { 201 | match *v { 202 | serde_json::Value::Null => Ok(Vec::new()), 203 | serde_json::Value::Array(ref a) => cap_from_array::(a), 204 | _ => Err(serde::de::Error::custom( 205 | "Unexpected value in capability set", 206 | )), 207 | } 208 | } else { 209 | Ok(Vec::new()) 210 | } 211 | } 212 | 213 | // handle the old case where caps was just a list of caps 214 | fn deserialize_capabilities( 215 | de: D, 216 | ) -> Result, D::Error> 217 | where 218 | D: serde::Deserializer, 219 | { 220 | let r: serde_json::Value = serde::Deserialize::deserialize(de)?; 221 | match r { 222 | serde_json::Value::Null => Ok(None), 223 | serde_json::Value::Array(a) => { 224 | let caps = cap_from_array::(&a)?; 225 | let capabilities = LinuxCapabilities { 226 | bounding: caps.clone(), 227 | effective: 
caps.clone(), 228 | inheritable: caps.clone(), 229 | permitted: caps.clone(), 230 | ambient: caps.clone(), 231 | }; 232 | 233 | Ok(Some(capabilities)) 234 | } 235 | serde_json::Value::Object(o) => { 236 | let capabilities = LinuxCapabilities{ 237 | bounding: cap_from_object::(&o, "bounding")?, 238 | effective: cap_from_object::(&o, "effective")?, 239 | inheritable: cap_from_object::(&o, "inheritable")?, 240 | permitted: cap_from_object::(&o, "permitted")?, 241 | ambient: cap_from_object::(&o, "ambient")?, 242 | }; 243 | 244 | Ok(Some(capabilities)) 245 | } 246 | _ => Err(serde::de::Error::custom("Unexpected value in capabilites")), 247 | } 248 | } 249 | 250 | #[derive(Serialize, Deserialize, Debug)] 251 | pub struct Root { 252 | #[serde(default)] 253 | pub path: String, 254 | #[serde(default, skip_serializing_if = "is_false")] 255 | pub readonly: bool, 256 | } 257 | 258 | #[derive(Serialize, Deserialize, Debug)] 259 | pub struct Mount { 260 | #[serde(default)] 261 | pub destination: String, 262 | #[serde(default, skip_serializing_if = "String::is_empty", rename = "type")] 263 | pub typ: String, 264 | #[serde(default)] 265 | pub source: String, 266 | #[serde(default, skip_serializing_if = "Vec::is_empty")] 267 | pub options: Vec, 268 | } 269 | 270 | #[derive(Serialize, Deserialize, Debug)] 271 | pub struct Hook { 272 | #[serde(default, skip_serializing_if = "String::is_empty")] 273 | pub path: String, 274 | #[serde(default, skip_serializing_if = "Vec::is_empty")] 275 | pub args: Vec, 276 | #[serde(default, skip_serializing_if = "Vec::is_empty")] 277 | pub env: Vec, 278 | #[serde(skip_serializing_if = "Option::is_none")] 279 | pub timeout: Option, 280 | } 281 | 282 | #[derive(Serialize, Deserialize, Debug)] 283 | pub struct Hooks { 284 | #[serde(default, skip_serializing_if = "Vec::is_empty")] 285 | pub prestart: Vec, 286 | #[serde(default, skip_serializing_if = "Vec::is_empty")] 287 | pub poststart: Vec, 288 | #[serde(default, skip_serializing_if = "Vec::is_empty")] 
289 | pub poststop: Vec, 290 | } 291 | 292 | #[derive(Serialize, Deserialize, Debug, Clone)] 293 | pub struct LinuxIDMapping { 294 | #[serde(default, rename = "hostID")] 295 | pub host_id: u32, 296 | #[serde(default, rename = "containerID")] 297 | pub container_id: u32, 298 | #[serde(default)] 299 | pub size: u32, 300 | } 301 | 302 | // a is for LinuxDeviceCgroup 303 | #[derive(Serialize, Deserialize, Debug, Clone, Copy)] 304 | pub enum LinuxDeviceType { 305 | b, 306 | c, 307 | u, 308 | p, 309 | a, 310 | } 311 | 312 | impl Default for LinuxDeviceType { 313 | fn default() -> LinuxDeviceType { 314 | LinuxDeviceType::a 315 | } 316 | } 317 | 318 | #[derive(Serialize, Deserialize, Debug)] 319 | pub struct LinuxDeviceCgroup { 320 | #[serde(default, skip_serializing_if = "is_false")] 321 | pub allow: bool, 322 | #[serde(default, rename = "type")] 323 | pub typ: LinuxDeviceType, 324 | #[serde(skip_serializing_if = "Option::is_none")] 325 | pub major: Option, 326 | #[serde(skip_serializing_if = "Option::is_none")] 327 | pub minor: Option, 328 | #[serde(default, skip_serializing_if = "String::is_empty")] 329 | pub access: String, 330 | } 331 | 332 | #[derive(Serialize, Deserialize, Debug)] 333 | pub struct LinuxMemory { 334 | #[serde(skip_serializing_if = "Option::is_none")] 335 | pub limit: Option, 336 | #[serde(skip_serializing_if = "Option::is_none")] 337 | pub reservation: Option, 338 | #[serde(skip_serializing_if = "Option::is_none")] 339 | pub swap: Option, 340 | #[serde(skip_serializing_if = "Option::is_none")] 341 | pub kernel: Option, 342 | #[serde(skip_serializing_if = "Option::is_none", rename = "kernelTCP")] 343 | pub kernel_tcp: Option, 344 | #[serde(skip_serializing_if = "Option::is_none")] 345 | pub swappiness: Option, 346 | } 347 | 348 | #[derive(Serialize, Deserialize, Debug)] 349 | pub struct LinuxCPU { 350 | #[serde(skip_serializing_if = "Option::is_none")] 351 | pub shares: Option, 352 | #[serde(skip_serializing_if = "Option::is_none")] 353 | pub quota: 
Option, 354 | #[serde(skip_serializing_if = "Option::is_none")] 355 | pub period: Option, 356 | #[serde(skip_serializing_if = "Option::is_none", 357 | rename = "realtimeRuntime")] 358 | pub realtime_runtime: Option, 359 | #[serde(skip_serializing_if = "Option::is_none", rename = "realtimePeriod")] 360 | pub realtime_period: Option, 361 | #[serde(default, skip_serializing_if = "String::is_empty")] 362 | pub cpus: String, 363 | #[serde(default, skip_serializing_if = "String::is_empty")] 364 | pub mems: String, 365 | } 366 | 367 | #[derive(Serialize, Deserialize, Debug)] 368 | pub struct LinuxPids { 369 | #[serde(default)] 370 | pub limit: i64, 371 | } 372 | 373 | #[derive(Serialize, Deserialize, Debug)] 374 | pub struct LinuxWeightDevice { 375 | #[serde(default)] 376 | pub major: i64, 377 | #[serde(default)] 378 | pub minor: i64, 379 | #[serde(skip_serializing_if = "Option::is_none")] 380 | pub weight: Option, 381 | #[serde(skip_serializing_if = "Option::is_none", rename = "leafWeight")] 382 | pub leaf_weight: Option, 383 | } 384 | 385 | #[derive(Serialize, Deserialize, Debug)] 386 | pub struct LinuxThrottleDevice { 387 | #[serde(default)] 388 | pub major: i64, 389 | #[serde(default)] 390 | pub minor: i64, 391 | #[serde(default)] 392 | pub rate: u64, 393 | } 394 | 395 | #[derive(Serialize, Deserialize, Debug)] 396 | pub struct LinuxBlockIO { 397 | #[serde(skip_serializing_if = "Option::is_none", rename = "blkioWeight")] 398 | pub weight: Option, 399 | #[serde(skip_serializing_if = "Option::is_none", 400 | rename = "blkioLeafWeight")] 401 | pub leaf_weight: Option, 402 | #[serde(default, skip_serializing_if = "Vec::is_empty", 403 | rename = "blkioWeightDevice")] 404 | pub weight_device: Vec, 405 | #[serde(default, skip_serializing_if = "Vec::is_empty", 406 | rename = "blkioThrottleReadBpsDevice")] 407 | pub throttle_read_bps_device: Vec, 408 | #[serde(default, skip_serializing_if = "Vec::is_empty", 409 | rename = "blkioThrottleWriteBpsDevice")] 410 | pub 
throttle_write_bps_device: Vec, 411 | #[serde(default, skip_serializing_if = "Vec::is_empty", 412 | rename = "blkioThrottleReadIOPSDevice")] 413 | pub throttle_read_iops_device: Vec, 414 | #[serde(default, skip_serializing_if = "Vec::is_empty", 415 | rename = "blkioThrottleWriteIOPSDevice")] 416 | pub throttle_write_iops_device: Vec, 417 | } 418 | 419 | #[derive(Serialize, Deserialize, Debug)] 420 | pub struct LinuxHugepageLimit { 421 | #[serde(default, skip_serializing_if = "String::is_empty", 422 | rename = "pageSize")] 423 | pub page_size: String, 424 | #[serde(default)] 425 | pub limit: i64, 426 | } 427 | 428 | 429 | #[derive(Serialize, Deserialize, Debug)] 430 | pub struct LinuxInterfacePriority { 431 | #[serde(default, skip_serializing_if = "String::is_empty")] 432 | pub name: String, 433 | #[serde(default)] 434 | pub priority: u32, 435 | } 436 | 437 | #[derive(Serialize, Deserialize, Debug)] 438 | pub struct LinuxNetwork { 439 | #[serde(skip_serializing_if = "Option::is_none", rename = "classID")] 440 | pub class_id: Option, 441 | #[serde(default, skip_serializing_if = "Vec::is_empty")] 442 | pub priorities: Vec, 443 | } 444 | 445 | #[derive(Default, Serialize, Deserialize, Debug)] 446 | pub struct LinuxResources { 447 | #[serde(default, skip_serializing_if = "Vec::is_empty")] 448 | pub devices: Vec, 449 | // NOTE: spec uses a pointer here, so perhaps this should be an Option, but 450 | // false == unset so we don't bother. 
451 | #[serde(default, skip_serializing_if = "is_false", 452 | rename = "disableOOMKiller")] 453 | pub disable_oom_killer: bool, 454 | // NOTE: spec refers to this as an isize but the range is -1000 to 1000, so 455 | // an i32 seems just fine 456 | #[serde(skip_serializing_if = "Option::is_none", rename = "oomScoreAdj")] 457 | pub oom_score_adj: Option, 458 | #[serde(skip_serializing_if = "Option::is_none")] 459 | pub memory: Option, 460 | #[serde(skip_serializing_if = "Option::is_none")] 461 | pub cpu: Option, 462 | #[serde(skip_serializing_if = "Option::is_none")] 463 | pub pids: Option, 464 | #[serde(skip_serializing_if = "Option::is_none", rename = "blockIO")] 465 | pub block_io: Option, 466 | #[serde(default, skip_serializing_if = "Vec::is_empty", 467 | rename = "hugepageLimits")] 468 | pub hugepage_limits: Vec, 469 | #[serde(skip_serializing_if = "Option::is_none")] 470 | pub network: Option, 471 | } 472 | 473 | #[derive(Serialize, Deserialize, Debug, Clone, Copy)] 474 | pub enum LinuxNamespaceType { 475 | mount = 0x00020000, /* New mount namespace group */ 476 | cgroup = 0x02000000, /* New cgroup namespace */ 477 | uts = 0x04000000, /* New utsname namespace */ 478 | ipc = 0x08000000, /* New ipc namespace */ 479 | user = 0x10000000, /* New user namespace */ 480 | pid = 0x20000000, /* New pid namespace */ 481 | network = 0x40000000, /* New network namespace */ 482 | } 483 | 484 | #[derive(Serialize, Deserialize, Debug)] 485 | pub struct LinuxNamespace { 486 | #[serde(rename = "type")] 487 | pub typ: LinuxNamespaceType, 488 | #[serde(default, skip_serializing_if = "String::is_empty")] 489 | pub path: String, 490 | } 491 | 492 | #[derive(Serialize, Deserialize, Debug)] 493 | pub struct LinuxDevice { 494 | #[serde(default, skip_serializing_if = "String::is_empty")] 495 | pub path: String, 496 | #[serde(rename = "type")] 497 | pub typ: LinuxDeviceType, 498 | #[serde(default)] 499 | pub major: u64, 500 | #[serde(default)] 501 | pub minor: u64, 502 | 
#[serde(skip_serializing_if = "Option::is_none", rename = "fileMode")] 503 | pub file_mode: Option, 504 | #[serde(skip_serializing_if = "Option::is_none")] 505 | pub uid: Option, 506 | #[serde(skip_serializing_if = "Option::is_none")] 507 | pub gid: Option, 508 | } 509 | 510 | #[derive(Serialize, Deserialize, Debug, Clone, Copy)] 511 | #[repr(u32)] 512 | pub enum LinuxSeccompAction { 513 | SCMP_ACT_KILL = 0x00000000, 514 | SCMP_ACT_TRAP = 0x00030000, 515 | SCMP_ACT_ERRNO = 0x00050001, /* ERRNO + EPERM */ 516 | SCMP_ACT_TRACE = 0x7ff00001, /* TRACE + EPERM */ 517 | SCMP_ACT_ALLOW = 0x7fff0000, 518 | } 519 | 520 | #[derive(Serialize, Deserialize, Debug, Clone, Copy)] 521 | pub enum Arch { 522 | SCMP_ARCH_NATIVE = 0x00000000, 523 | SCMP_ARCH_X86 = 0x40000003, 524 | SCMP_ARCH_X86_64 = 0xc000003e, 525 | SCMP_ARCH_X32 = 0x4000003e, 526 | SCMP_ARCH_ARM = 0x40000028, 527 | SCMP_ARCH_AARCH64 = 0xc00000b7, 528 | SCMP_ARCH_MIPS = 0x00000008, 529 | SCMP_ARCH_MIPS64 = 0x80000008, 530 | SCMP_ARCH_MIPS64N32 = 0xa0000008, 531 | SCMP_ARCH_MIPSEL = 0x40000008, 532 | SCMP_ARCH_MIPSEL64 = 0xc0000008, 533 | SCMP_ARCH_MIPSEL64N32 = 0xe0000008, 534 | SCMP_ARCH_PPC = 0x00000014, 535 | SCMP_ARCH_PPC64 = 0x80000015, 536 | SCMP_ARCH_PPC64LE = 0xc0000015, 537 | SCMP_ARCH_S390 = 0x00000016, 538 | SCMP_ARCH_S390X = 0x80000016, 539 | } 540 | 541 | #[derive(Serialize, Deserialize, Debug, Clone, Copy)] 542 | #[repr(u32)] 543 | pub enum LinuxSeccompOperator { 544 | SCMP_CMP_NE = 1, /* not equal */ 545 | SCMP_CMP_LT = 2, /* less than */ 546 | SCMP_CMP_LE = 3, /* less than or equal */ 547 | SCMP_CMP_EQ = 4, /* equal */ 548 | SCMP_CMP_GE = 5, /* greater than or equal */ 549 | SCMP_CMP_GT = 6, /* greater than */ 550 | SCMP_CMP_MASKED_EQ = 7, /* masked equality */ 551 | } 552 | 553 | #[derive(Serialize, Deserialize, Debug)] 554 | pub struct LinuxSeccompArg { 555 | #[serde(default)] 556 | pub index: usize, 557 | #[serde(default)] 558 | pub value: u64, 559 | #[serde(default, rename = "valueTwo")] 560 | 
pub value_two: u64, 561 | pub op: LinuxSeccompOperator, 562 | } 563 | 564 | #[derive(Serialize, Deserialize, Debug)] 565 | pub struct LinuxSyscall { 566 | // old version used name 567 | #[serde(default, skip_serializing_if = "String::is_empty")] 568 | pub name: String, 569 | #[serde(default, skip_serializing_if = "Vec::is_empty")] 570 | pub names: Vec, 571 | pub action: LinuxSeccompAction, 572 | #[serde(default, skip_serializing_if = "Vec::is_empty")] 573 | pub args: Vec, 574 | } 575 | 576 | #[derive(Serialize, Deserialize, Debug)] 577 | pub struct LinuxSeccomp { 578 | #[serde(rename = "defaultAction")] 579 | pub default_action: LinuxSeccompAction, 580 | #[serde(default, skip_serializing_if = "Vec::is_empty")] 581 | pub architectures: Vec, 582 | #[serde(default, skip_serializing_if = "Vec::is_empty")] 583 | pub syscalls: Vec, 584 | } 585 | 586 | #[derive(Serialize, Deserialize, Debug)] 587 | pub struct Linux { 588 | #[serde(default, skip_serializing_if = "Vec::is_empty", 589 | rename = "uidMappings")] 590 | pub uid_mappings: Vec, 591 | #[serde(default, skip_serializing_if = "Vec::is_empty", 592 | rename = "gidMappings")] 593 | pub gid_mappings: Vec, 594 | #[serde(default, skip_serializing_if = "HashMap::is_empty")] 595 | pub sysctl: HashMap, 596 | #[serde(skip_serializing_if = "Option::is_none")] 597 | pub resources: Option, 598 | #[serde(default, skip_serializing_if = "String::is_empty", 599 | rename = "cgroupsPath")] 600 | pub cgroups_path: String, 601 | #[serde(default, skip_serializing_if = "Vec::is_empty")] 602 | pub namespaces: Vec, 603 | #[serde(default, skip_serializing_if = "Vec::is_empty")] 604 | pub devices: Vec, 605 | #[serde(skip_serializing_if = "Option::is_none")] 606 | pub seccomp: Option, 607 | #[serde(default, skip_serializing_if = "String::is_empty", 608 | rename = "rootfsPropagation")] 609 | pub rootfs_propagation: String, 610 | #[serde(default, skip_serializing_if = "Vec::is_empty", 611 | rename = "maskedPaths")] 612 | pub masked_paths: Vec, 
613 | #[serde(default, skip_serializing_if = "Vec::is_empty", 614 | rename = "readonlyPaths")] 615 | pub readonly_paths: Vec, 616 | #[serde(default, skip_serializing_if = "String::is_empty", 617 | rename = "mountLabel")] 618 | pub mount_label: String, 619 | } 620 | 621 | // NOTE: Solaris and Windows are ignored for the moment 622 | pub type Solaris = Value; 623 | pub type Windows = Value; 624 | 625 | 626 | #[derive(Serialize, Deserialize, Debug)] 627 | pub struct Spec { 628 | #[serde(default, skip_serializing_if = "String::is_empty", 629 | rename = "ociVersion")] 630 | pub version: String, 631 | // NOTE: Platform was removed, but keeping it as an option 632 | // to support older docker versions 633 | #[serde(skip_serializing_if = "Option::is_none")] 634 | pub platform: Option, 635 | pub process: Process, 636 | pub root: Root, 637 | #[serde(default, skip_serializing_if = "String::is_empty")] 638 | pub hostname: String, 639 | #[serde(default, skip_serializing_if = "Vec::is_empty")] 640 | pub mounts: Vec, 641 | #[serde(skip_serializing_if = "Option::is_none")] 642 | pub hooks: Option, 643 | #[serde(default, skip_serializing_if = "HashMap::is_empty")] 644 | pub annotations: HashMap, 645 | #[serde(skip_serializing_if = "Option::is_none")] 646 | pub linux: Option, 647 | #[serde(skip_serializing_if = "Option::is_none")] 648 | pub solaris: Option, 649 | #[serde(skip_serializing_if = "Option::is_none")] 650 | pub windows: Option, 651 | } 652 | 653 | impl Spec { 654 | pub fn load(path: &str) -> Result { 655 | serialize::deserialize(path) 656 | } 657 | 658 | pub fn save(&self, path: &str) -> Result<(), serialize::SerializeError> { 659 | serialize::serialize(self, path) 660 | } 661 | } 662 | 663 | #[derive(Serialize, Deserialize, Debug)] 664 | pub struct State { 665 | #[serde(default, skip_serializing_if = "String::is_empty", 666 | rename = "ociVersion")] 667 | pub version: String, 668 | #[serde(default, skip_serializing_if = "String::is_empty")] 669 | pub id: String, 670 | 
#[serde(default, skip_serializing_if = "String::is_empty")] 671 | pub status: String, 672 | #[serde(default)] 673 | pub pid: i32, 674 | #[serde(default, skip_serializing_if = "String::is_empty")] 675 | pub bundle: String, 676 | #[serde(default, skip_serializing_if = "HashMap::is_empty")] 677 | pub annotations: HashMap, 678 | } 679 | 680 | impl State { 681 | pub fn to_string(&self) -> Result { 682 | serialize::to_string(self) 683 | } 684 | 685 | pub fn to_writer( 686 | &self, 687 | mut writer: W, 688 | ) -> Result<(), serialize::SerializeError> { 689 | serialize::to_writer(self, &mut writer) 690 | } 691 | } 692 | -------------------------------------------------------------------------------- /oci/src/serialize.rs: -------------------------------------------------------------------------------- 1 | use serde_json; 2 | use serde; 3 | 4 | use std::fmt; 5 | use std::io; 6 | use std::error::Error; 7 | use std::fs::File; 8 | 9 | #[derive(Debug)] 10 | pub enum SerializeError { 11 | Io(io::Error), 12 | Json(serde_json::Error), 13 | } 14 | 15 | impl fmt::Display for SerializeError { 16 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 17 | match *self { 18 | SerializeError::Io(ref err) => err.fmt(f), 19 | SerializeError::Json(ref err) => err.fmt(f), 20 | } 21 | } 22 | } 23 | 24 | impl Error for SerializeError { 25 | fn description(&self) -> &str { 26 | match *self { 27 | SerializeError::Io(ref err) => err.description(), 28 | SerializeError::Json(ref err) => err.description(), 29 | } 30 | } 31 | 32 | fn cause(&self) -> Option<&Error> { 33 | match *self { 34 | SerializeError::Io(ref err) => Some(err), 35 | SerializeError::Json(ref err) => Some(err), 36 | } 37 | } 38 | } 39 | 40 | impl From for SerializeError { 41 | fn from(err: io::Error) -> SerializeError { 42 | SerializeError::Io(err) 43 | } 44 | } 45 | 46 | impl From for SerializeError { 47 | fn from(err: serde_json::Error) -> SerializeError { 48 | SerializeError::Json(err) 49 | } 50 | } 51 | 52 | pub fn to_writer( 
53 | obj: &T, 54 | mut writer: W, 55 | ) -> Result<(), SerializeError> { 56 | Ok(serde_json::to_writer(&mut writer, &obj)?) 57 | } 58 | 59 | // pub fn from_reader(reader: R) 60 | // -> Result { 61 | // Ok(serde_json::from_reader(reader)?) 62 | // } 63 | 64 | pub fn serialize( 65 | obj: &T, 66 | path: &str, 67 | ) -> Result<(), SerializeError> { 68 | let mut file = File::create(path)?; 69 | Ok(serde_json::to_writer(&mut file, &obj)?) 70 | } 71 | 72 | pub fn deserialize( 73 | path: &str, 74 | ) -> Result { 75 | let file = File::open(path)?; 76 | Ok(serde_json::from_reader(&file)?) 77 | } 78 | 79 | pub fn to_string( 80 | obj: &T, 81 | ) -> Result { 82 | Ok(serde_json::to_string(&obj)?) 83 | } 84 | -------------------------------------------------------------------------------- /railcar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle/railcar/ef5918e21e7ad9ffd25c1a507df96458ec4e0c24/railcar.png -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | max_width = 80 2 | -------------------------------------------------------------------------------- /src/capabilities.rs: -------------------------------------------------------------------------------- 1 | use caps::*; 2 | use oci::{LinuxCapabilities, LinuxCapabilityType}; 3 | 4 | fn to_cap(cap: LinuxCapabilityType) -> Capability { 5 | unsafe { ::std::mem::transmute(cap) } 6 | } 7 | 8 | fn to_set(caps: &[LinuxCapabilityType]) -> CapsHashSet { 9 | let mut capabilities = CapsHashSet::new(); 10 | for c in caps { 11 | capabilities.insert(to_cap(*c)); 12 | } 13 | capabilities 14 | } 15 | 16 | pub fn reset_effective() -> ::Result<()> { 17 | set(None, CapSet::Effective, ::caps::all())?; 18 | Ok(()) 19 | } 20 | 21 | pub fn drop_privileges(cs: &LinuxCapabilities) -> ::Result<()> { 22 | let all = ::caps::all(); 23 | debug!("dropping 
bounding capabilities to {:?}", cs.bounding); 24 | // drop excluded caps from the bounding set 25 | for c in all.difference(&to_set(&cs.bounding)) { 26 | drop(None, CapSet::Bounding, *c)?; 27 | } 28 | // set other sets for current process 29 | set(None, CapSet::Effective, to_set(&cs.effective))?; 30 | set(None, CapSet::Permitted, to_set(&cs.permitted))?; 31 | set(None, CapSet::Inheritable, to_set(&cs.inheritable))?; 32 | if let Err(e) = set(None, CapSet::Ambient, to_set(&cs.ambient)) { 33 | warn!("failed to set ambient capabilities: {}", e); 34 | } 35 | Ok(()) 36 | } 37 | -------------------------------------------------------------------------------- /src/cgroups.rs: -------------------------------------------------------------------------------- 1 | use errors::*; 2 | use lazy_static::initialize; 3 | use nix::unistd::Pid; 4 | use num_traits::identities::Zero; 5 | use oci::LinuxDeviceType; 6 | use oci::{LinuxDeviceCgroup, LinuxResources, LinuxThrottleDevice}; 7 | use std::collections::HashMap; 8 | use std::fs::{create_dir_all, remove_dir, File}; 9 | use std::io::{BufRead, BufReader, Read, Write}; 10 | use std::string::ToString; 11 | 12 | pub fn init() { 13 | // initialize lazy_static maps 14 | initialize(&PATHS); 15 | initialize(&MOUNTS); 16 | initialize(&DEFAULT_ALLOWED_DEVICES); 17 | initialize(&APPLIES); 18 | } 19 | 20 | pub fn apply( 21 | resources: &Option, 22 | pid: &str, 23 | cgroups_path: &str, 24 | ) -> Result<()> { 25 | for key in MOUNTS.keys() { 26 | let dir = if let Some(s) = path(key, cgroups_path) { 27 | s 28 | } else { 29 | continue; 30 | }; 31 | // ensure cgroup dir 32 | debug!{"creating cgroup dir {}", &dir}; 33 | let chain = || format!("create cgroup dir {} failed", &dir); 34 | create_dir_all(&dir).chain_err(chain)?; 35 | // enter cgroups 36 | for k in key.split(',') { 37 | if let Some(cgroup_apply) = APPLIES.get(k) { 38 | if let Some(ref r) = *resources { 39 | cgroup_apply(r, &dir)?; 40 | } else { 41 | // apply with empty resources 42 | 
cgroup_apply(&LinuxResources::default(), &dir)?; 43 | } 44 | write_file(&dir, "cgroup.procs", pid)?; 45 | } 46 | } 47 | } 48 | Ok(()) 49 | } 50 | 51 | pub fn remove(cgroups_path: &str) -> Result<()> { 52 | for key in MOUNTS.keys() { 53 | let dir = if let Some(s) = path(key, cgroups_path) { 54 | s 55 | } else { 56 | continue; 57 | }; 58 | debug!{"removing cgroup dir {}", &dir}; 59 | // remove cgroup dir 60 | let chain = || format!("remove cgroup dir {} failed", &dir); 61 | remove_dir(&dir).chain_err(chain)?; 62 | } 63 | Ok(()) 64 | } 65 | 66 | #[inline] 67 | fn wrnz( 68 | dir: &str, 69 | key: &str, 70 | value: Option, 71 | ) -> Result<()> { 72 | match value { 73 | Some(val) => { 74 | if !val.is_zero() { 75 | write_file(dir, key, &val.to_string()) 76 | } else { 77 | Ok(()) 78 | } 79 | } 80 | None => Ok(()), 81 | } 82 | } 83 | 84 | #[inline] 85 | fn try_wrnz( 86 | dir: &str, 87 | key: &str, 88 | value: Option, 89 | ) -> Result<()> { 90 | match wrnz(dir, key, value) { 91 | Err(Error(ErrorKind::Io(e), x)) => { 92 | if e.kind() == ::std::io::ErrorKind::PermissionDenied { 93 | warn!{"setting cgroup value {} is not supported", key} 94 | Ok(()) 95 | } else { 96 | Err(Error(ErrorKind::Io(e), x)) 97 | } 98 | } 99 | x => x, 100 | } 101 | } 102 | 103 | pub fn write_file(dir: &str, file: &str, data: &str) -> Result<()> { 104 | let path = format!{"{}/{}", dir, file}; 105 | debug!{"writing {} to {}", data, &path}; 106 | let mut f = File::create(&path)?; 107 | f.write_all(data.as_bytes())?; 108 | Ok(()) 109 | } 110 | 111 | pub fn read_file(dir: &str, file: &str) -> Result<(String)> { 112 | let path = format!{"{}/{}", dir, file}; 113 | let mut f = File::open(&path)?; 114 | let mut result = String::new(); 115 | f.read_to_string(&mut result)?; 116 | debug!{"read {} from {}", &result, &path}; 117 | Ok(result) 118 | } 119 | 120 | pub fn path(key: &str, cgroups_path: &str) -> Option { 121 | let mount = MOUNTS.get(key); 122 | let rel = PATHS.get(key); 123 | if mount.is_none() || 
rel.is_none() { 124 | None 125 | } else if rel.unwrap() == "/" { 126 | Some(format!{"{}{}", &mount.unwrap(), cgroups_path}) 127 | } else { 128 | Some(format!{"{}{}{}", &mount.unwrap(), &rel.unwrap(), cgroups_path}) 129 | } 130 | } 131 | 132 | pub fn get_procs(key: &str, cgroups_path: &str) -> Vec { 133 | let mut result = Vec::new(); 134 | if let Some(dir) = path(key, cgroups_path) { 135 | let path = format!{"{}/cgroup.procs", dir}; 136 | let f = match File::open(path) { 137 | Ok(f) => f, 138 | Err(e) => { 139 | warn!{"could not cgroup.procs: {}", e}; 140 | return result; 141 | } 142 | }; 143 | for line in BufReader::new(f).lines() { 144 | let l = match line { 145 | Ok(l) => l, 146 | Err(e) => { 147 | warn!("failed to read cgroup info: {}", e); 148 | return result; 149 | } 150 | }; 151 | if let Ok(pid) = l.parse::() { 152 | result.push(Pid::from_raw(pid)); 153 | } 154 | } 155 | } 156 | result 157 | } 158 | 159 | lazy_static! { 160 | pub static ref PATHS: HashMap = { 161 | let mut result = HashMap::new(); 162 | let f = match File::open("/proc/self/cgroup") { 163 | Ok(f) => f, 164 | Err(e) => { 165 | warn!{"could not load cgroup info: {}", e}; 166 | return result; 167 | } 168 | }; 169 | 170 | for line in BufReader::new(f).lines() { 171 | let l = match line { 172 | Ok(l) => l, 173 | Err(e) => { 174 | warn!("failed to read cgroup info: {}", e); 175 | return result; 176 | } 177 | }; 178 | let fields: Vec<&str> = l.split(':').collect(); 179 | if fields.len() != 3 { 180 | warn!("cgroup data is corrupted"); 181 | continue; 182 | } 183 | result.insert(fields[1].to_string(), fields[2].to_string()); 184 | } 185 | 186 | result 187 | }; 188 | } 189 | 190 | lazy_static! 
{ 191 | pub static ref MOUNTS: HashMap = { 192 | let mut result = HashMap::new(); 193 | let f = match File::open("/proc/self/mountinfo") { 194 | Ok(f) => f, 195 | Err(e) => { 196 | warn!{"could not load mount info: {}", e}; 197 | return result; 198 | } 199 | }; 200 | for line in BufReader::new(f).lines() { 201 | let l = match line { 202 | Ok(l) => l, 203 | Err(e) => { 204 | warn!("failed to read mount info: {}", e); 205 | return result; 206 | } 207 | }; 208 | if let Some(sep) = l.find(" - ") { 209 | if l.len() < sep + 10 { 210 | continue; 211 | } 212 | let key = &l[sep + 3..sep + 10]; 213 | if key != "cgroup " && key != "cgroup2" { 214 | continue; 215 | } 216 | let pre: Vec<&str> = l[..sep].split(' ').collect(); 217 | if pre.len() != 7 { 218 | warn!("mountinfo data is corrupted"); 219 | continue; 220 | } 221 | let post: Vec<&str> = l[sep + 3..].split(' ').collect(); 222 | if post.len() != 3 { 223 | warn!("mountinfo data is corrupted"); 224 | continue; 225 | } 226 | let mut offset = post[2].len(); 227 | while let Some(o) = post[2][..offset].rfind(',') { 228 | let name = &post[2][o + 1..]; 229 | if PATHS.contains_key(name) { 230 | result.insert(name.to_string(), pre[4].to_string()); 231 | break; 232 | } 233 | offset = o; 234 | } 235 | } else { 236 | warn!("mountinfo data is corrupted"); 237 | } 238 | } 239 | result 240 | }; 241 | } 242 | 243 | lazy_static! 
{ 244 | static ref DEFAULT_ALLOWED_DEVICES: Vec = { 245 | let mut v = Vec::new(); 246 | // mknod any device 247 | v.push(LinuxDeviceCgroup{ 248 | allow: true, 249 | typ: LinuxDeviceType::c, 250 | major: None, 251 | minor: None, 252 | access: "m".to_string(), 253 | }); 254 | v.push(LinuxDeviceCgroup{ 255 | allow: true, 256 | typ: LinuxDeviceType::b, 257 | major: None, 258 | minor: None, 259 | access: "m".to_string(), 260 | }); 261 | // /dev/console 262 | v.push(LinuxDeviceCgroup{ 263 | allow: true, 264 | typ: LinuxDeviceType::c, 265 | major: Some(5), 266 | minor: Some(1), 267 | access: "rwm".to_string(), 268 | }); 269 | // /dev/pts 270 | v.push(LinuxDeviceCgroup{ 271 | allow: true, 272 | typ: LinuxDeviceType::c, 273 | major: Some(136), 274 | minor: None, 275 | access: "rwm".to_string(), 276 | }); 277 | v.push(LinuxDeviceCgroup{ 278 | allow: true, 279 | typ: LinuxDeviceType::c, 280 | major: Some(5), 281 | minor: Some(2), 282 | access: "rwm".to_string(), 283 | }); 284 | // tun/tap 285 | v.push(LinuxDeviceCgroup{ 286 | allow: true, 287 | typ: LinuxDeviceType::c, 288 | major: Some(10), 289 | minor: Some(200), 290 | access: "rwm".to_string(), 291 | }); 292 | v 293 | }; 294 | } 295 | 296 | type Apply = fn(&LinuxResources, &str) -> Result<()>; 297 | 298 | lazy_static! 
{ 299 | static ref APPLIES: HashMap<&'static str, Apply> = { 300 | let mut m: HashMap<&'static str, Apply> = HashMap::new(); 301 | m.insert("cpuacct", null_apply); // no settings for cpuacct 302 | m.insert("perf_event", null_apply); // no settings for perf_event 303 | m.insert("freezer", null_apply); // no settings for freezer 304 | m.insert("name=systemd", null_apply); // no settings for systemd 305 | m.insert("cpuset", cpuset_apply); 306 | m.insert("cpu", cpu_apply); 307 | m.insert("memory", memory_apply); 308 | m.insert("blkio", blkio_apply); 309 | m.insert("pids", pids_apply); 310 | m.insert("net_cls", net_cls_apply); 311 | m.insert("net_prio", net_prio_apply); 312 | m.insert("hugetlb", hugetlb_apply); 313 | m.insert("devices", devices_apply); 314 | m 315 | }; 316 | } 317 | 318 | fn copy_parent(dir: &str, file: &str) -> Result<()> { 319 | let parent = if let Some(o) = dir.rfind('/') { 320 | &dir[..o] 321 | } else { 322 | bail!{"failed to find {} in parent cgroups", file}; 323 | }; 324 | match read_file(parent, file) { 325 | Err(Error(ErrorKind::Io(e), _)) => { 326 | if e.kind() == ::std::io::ErrorKind::NotFound { 327 | // copy parent and then retry 328 | copy_parent(parent, file)?; 329 | return copy_parent(dir, file); 330 | } 331 | let msg = "failed to copy parent cgroup".to_string(); 332 | Err(e).chain_err(|| msg) 333 | } 334 | Err(e) => Err(e), 335 | Ok(data) => write_file(dir, file, &data), 336 | } 337 | } 338 | 339 | fn null_apply(_: &LinuxResources, _: &str) -> Result<()> { 340 | Ok(()) 341 | } 342 | 343 | fn cpuset_apply(r: &LinuxResources, dir: &str) -> Result<()> { 344 | // cpuset files are required so copy them from the parent 345 | let (cpus, mems) = if let Some(cpu) = r.cpu.as_ref() { 346 | (&cpu.cpus[..], &cpu.mems[..]) 347 | } else { 348 | ("", "") 349 | }; 350 | 351 | if cpus.is_empty() { 352 | copy_parent(dir, "cpuset.cpus")?; 353 | } else { 354 | write_file(dir, "cpuset.cpus", cpus)?; 355 | } 356 | if mems.is_empty() { 357 | copy_parent(dir, 
"cpuset.mems")?; 358 | } else { 359 | write_file(dir, "cpuset.mems", mems)?; 360 | } 361 | Ok(()) 362 | } 363 | 364 | fn cpu_apply(r: &LinuxResources, dir: &str) -> Result<()> { 365 | if let Some(cpu) = r.cpu.as_ref() { 366 | // NOTE: these values are nullable in the spec, but runc treats 367 | // null as a zero value 368 | try_wrnz(dir, "cpu.rt_period_us", cpu.realtime_period)?; 369 | try_wrnz(dir, "cpu.rt_runtime_us", cpu.realtime_runtime)?; 370 | wrnz(dir, "cpu.shares", cpu.shares)?; 371 | wrnz(dir, "cpu.cfs_quota_us", cpu.quota)?; 372 | wrnz(dir, "cpu.cfs_period_us", cpu.period)?; 373 | }; 374 | Ok(()) 375 | } 376 | fn memory_apply(r: &LinuxResources, dir: &str) -> Result<()> { 377 | // TODO: handle issues with joining an existing namespace 378 | if let Some(memory) = r.memory.as_ref() { 379 | // NOTE: these values are nullable in the spec, but runc treats 380 | // null as a zero value 381 | wrnz(dir, "memory.limit_in_bytes", memory.limit)?; 382 | wrnz(dir, "memory.soft_limit_in_bytes", memory.reservation)?; 383 | // NOTE: these two can be disabled in the kernel, so just warn 384 | // if they are not set 385 | try_wrnz(dir, "memory.memsw.limit_in_bytes", memory.swap)?; 386 | try_wrnz(dir, "memory.kmem.limit_in_bytes", memory.kernel)?; 387 | wrnz(dir, "memory.kmem.tcp.limit_in_bytes", memory.kernel_tcp)?; 388 | if let Some(s) = memory.swappiness { 389 | // NOTE: docker sends an invalid value for swappiness 390 | if s <= 100 { 391 | wrnz(dir, "memory.swappiness", memory.swappiness)?; 392 | } else { 393 | warn!{"memory swappiness invalid, working around bug"}; 394 | } 395 | } 396 | if r.disable_oom_killer { 397 | write_file(dir, "memory.oom_control", "1")?; 398 | } 399 | }; 400 | Ok(()) 401 | } 402 | 403 | #[inline] 404 | fn rate(d: &LinuxThrottleDevice) -> String { 405 | return format!{"{}:{} {}", d.major, d.minor, d.rate}; 406 | } 407 | 408 | fn blkio_apply(r: &LinuxResources, dir: &str) -> Result<()> { 409 | if let Some(blkio) = r.block_io.as_ref() { 410 | // 
NOTE: these values are nullable in the spec, but runc treats 411 | // null as a zero value 412 | wrnz(dir, "blkio.weight", blkio.weight)?; 413 | wrnz(dir, "blkio.leaf_weight", blkio.leaf_weight)?; 414 | for d in &blkio.weight_device { 415 | // NOTE: runc writes zero values here. This may be a bug, but 416 | // we are duplicating functionality. 417 | if let Some(w) = d.weight { 418 | let weight = format!{"{}:{} {}", d.major, d.minor, w}; 419 | write_file(dir, "blkio.weight_device", &weight)?; 420 | } 421 | if let Some(w) = d.leaf_weight { 422 | let weight = format!{"{}:{} {}", d.major, d.minor, w}; 423 | write_file(dir, "blkio.leaf_weight_device", &weight)?; 424 | } 425 | } 426 | for d in &blkio.throttle_read_bps_device { 427 | write_file(dir, "blkio.throttle.read_bps_device", &rate(d))?; 428 | } 429 | for d in &blkio.throttle_write_bps_device { 430 | write_file(dir, "blkio.throttle.write_bps_device", &rate(d))?; 431 | } 432 | for d in &blkio.throttle_read_iops_device { 433 | write_file(dir, "blkio.throttle.read_iops_device", &rate(d))?; 434 | } 435 | for d in &blkio.throttle_write_iops_device { 436 | write_file(dir, "blkio.throttle.write_iops_device", &rate(d))?; 437 | } 438 | } 439 | Ok(()) 440 | } 441 | 442 | fn pids_apply(r: &LinuxResources, dir: &str) -> Result<()> { 443 | if let Some(pids) = r.pids.as_ref() { 444 | if pids.limit > 0 { 445 | write_file(dir, "pids.max", &pids.limit.to_string())?; 446 | } else { 447 | write_file(dir, "pids.max", "max")?; 448 | }; 449 | } 450 | Ok(()) 451 | } 452 | 453 | fn net_cls_apply(r: &LinuxResources, dir: &str) -> Result<()> { 454 | if let Some(network) = r.network.as_ref() { 455 | wrnz(dir, "net_cls.classid", network.class_id)?; 456 | } 457 | Ok(()) 458 | } 459 | 460 | fn net_prio_apply(r: &LinuxResources, dir: &str) -> Result<()> { 461 | if let Some(network) = r.network.as_ref() { 462 | for p in &network.priorities { 463 | let prio = format!{"{} {}", p.name, p.priority}; 464 | write_file(dir, "net_prio.ifpriomap", 
&prio)?; 465 | } 466 | } 467 | Ok(()) 468 | } 469 | 470 | fn hugetlb_apply(r: &LinuxResources, dir: &str) -> Result<()> { 471 | for h in &r.hugepage_limits { 472 | let key = format!{"hugetlb.{}.limit_in_bytes", h.page_size}; 473 | write_file(dir, &key, &h.limit.to_string())?; 474 | } 475 | Ok(()) 476 | } 477 | 478 | fn write_device(d: &LinuxDeviceCgroup, dir: &str) -> Result<()> { 479 | let key = if d.allow { 480 | "devices.allow" 481 | } else { 482 | "devices.deny" 483 | }; 484 | let typ = match d.typ { 485 | LinuxDeviceType::b => "b", 486 | LinuxDeviceType::c => "c", 487 | LinuxDeviceType::a => "a", 488 | _ => { 489 | let msg = "invalid cgroup device type".to_string(); 490 | bail!(ErrorKind::InvalidSpec(msg)); 491 | } 492 | }; 493 | let major = if let Some(x) = d.major { 494 | x.to_string() 495 | } else { 496 | "*".to_string() 497 | }; 498 | let minor = if let Some(x) = d.minor { 499 | x.to_string() 500 | } else { 501 | "*".to_string() 502 | }; 503 | let val = format!{"{} {}:{} {}", typ, &major, &minor, &d.access}; 504 | write_file(dir, key, &val) 505 | } 506 | 507 | fn devices_apply(r: &LinuxResources, dir: &str) -> Result<()> { 508 | for d in &r.devices { 509 | write_device(d, dir)?; 510 | } 511 | for d in super::DEFAULT_DEVICES.iter() { 512 | let ld = LinuxDeviceCgroup { 513 | allow: true, 514 | typ: d.typ, 515 | major: Some(d.major as i64), 516 | minor: Some(d.minor as i64), 517 | access: "rwm".to_string(), 518 | }; 519 | 520 | write_device(&ld, dir)?; 521 | } 522 | for ld in DEFAULT_ALLOWED_DEVICES.iter() { 523 | write_device(ld, dir)?; 524 | } 525 | 526 | Ok(()) 527 | } 528 | -------------------------------------------------------------------------------- /src/errors.rs: -------------------------------------------------------------------------------- 1 | error_chain! 
{ 2 | types { 3 | Error, ErrorKind, ResultExt, Result; 4 | } 5 | foreign_links { 6 | Nix(::nix::Error); 7 | Io(::std::io::Error); 8 | Ffi(::std::ffi::NulError); 9 | Caps(::caps::errors::Error); 10 | } 11 | errors { 12 | InvalidSpec(t: String) { 13 | description("invalid spec") 14 | display("invalid spec: '{}'", t) 15 | } 16 | SeccompError(t: String) { 17 | description("seccomp error") 18 | display("seccomp error: '{}'", t) 19 | } 20 | Timeout(timeout: i32) { 21 | description("timeout") 22 | display("timeout after {} milliseconds", timeout) 23 | } 24 | PipeClosed(t: String) { 25 | description("pipe closed") 26 | display("pipe closed: '{}'", t) 27 | } 28 | InvalidValue(t: String) { 29 | description("invalid value") 30 | display("invalid value: '{}'", t) 31 | } 32 | InvalidHook(t: String) { 33 | description("invalid hook") 34 | display("invalid hook: '{}'", t) 35 | } 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/logger.rs: -------------------------------------------------------------------------------- 1 | use log::{Level, Log, Metadata, Record}; 2 | 3 | use std::io::{stderr, Write}; 4 | 5 | pub struct SimpleLogger; 6 | 7 | pub static SIMPLE_LOGGER: SimpleLogger = SimpleLogger; 8 | 9 | impl Log for SimpleLogger { 10 | fn enabled(&self, metadata: &Metadata) -> bool { 11 | metadata.level() <= Level::Debug 12 | } 13 | 14 | fn log(&self, record: &Record) { 15 | if self.enabled(record.metadata()) { 16 | let _ = writeln!( 17 | &mut stderr(), 18 | "{} - {}", 19 | record.level(), 20 | record.args() 21 | ); 22 | } 23 | } 24 | 25 | fn flush(&self) { 26 | stderr().flush().expect("Failed to flush"); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | #![allow(unknown_lints)] 2 | #![recursion_limit = "1024"] 3 | #![cfg_attr(feature = "nightly", feature(start))] 4 | 
#![cfg_attr(feature = "nightly", feature(alloc_system))] 5 | #[cfg(feature = "nightly")] 6 | extern crate alloc_system; 7 | 8 | extern crate caps; 9 | #[macro_use] 10 | extern crate clap; 11 | #[macro_use] 12 | extern crate error_chain; 13 | #[macro_use] 14 | extern crate lazy_static; 15 | extern crate libc; 16 | #[macro_use] 17 | extern crate log; 18 | extern crate nix; 19 | extern crate num_traits; 20 | extern crate prctl; 21 | #[macro_use] 22 | extern crate scopeguard; 23 | extern crate oci; 24 | extern crate seccomp_sys; 25 | 26 | mod capabilities; 27 | mod cgroups; 28 | mod errors; 29 | mod logger; 30 | mod mounts; 31 | mod nix_ext; 32 | mod seccomp; 33 | mod selinux; 34 | mod signals; 35 | mod sync; 36 | 37 | use clap::{App, AppSettings, Arg, ArgMatches, SubCommand}; 38 | use errors::*; 39 | use lazy_static::initialize; 40 | use nix::errno::Errno; 41 | use nix::fcntl::{open, OFlag}; 42 | use nix::poll::{poll, EventFlags, PollFd}; 43 | use nix::sched::{setns, unshare, CloneFlags}; 44 | use nix::sys::signal::{SigSet, Signal}; 45 | use nix::sys::socket::{accept, bind, connect, listen, sendmsg, socket}; 46 | use nix::sys::socket::{AddressFamily, SockAddr, SockFlag, SockType, UnixAddr}; 47 | use nix::sys::socket::{ControlMessage, MsgFlags}; 48 | use nix::sys::stat::{fstat, Mode}; 49 | use nix::sys::wait::WaitPidFlag; 50 | use nix::sys::wait::{waitpid, WaitStatus}; 51 | use nix::unistd::{chdir, execvp, getpid, sethostname, setresgid, setresuid}; 52 | use nix::unistd::{close, dup2, fork, pipe2, read, setsid, write, ForkResult}; 53 | use nix::unistd::{Gid, Pid, Uid}; 54 | use nix_ext::{clearenv, putenv, setgroups, setrlimit}; 55 | use oci::{Linux, LinuxIDMapping, LinuxRlimit, Spec}; 56 | use oci::{LinuxDevice, LinuxDeviceType}; 57 | use std::collections::HashMap; 58 | use std::ffi::CString; 59 | use std::fs::{canonicalize, create_dir, create_dir_all, remove_dir_all, File}; 60 | use std::io::{Read, Write}; 61 | use std::os::unix::fs::symlink; 62 | use 
std::os::unix::io::{FromRawFd, RawFd}; 63 | use std::result::Result as StdResult; 64 | use sync::Cond; 65 | 66 | lazy_static! { 67 | static ref DEFAULT_DEVICES: Vec = { 68 | let mut v = Vec::new(); 69 | v.push(LinuxDevice { 70 | path: "/dev/null".to_string(), 71 | typ: LinuxDeviceType::c, 72 | major: 1, 73 | minor: 3, 74 | file_mode: Some(0o066), 75 | uid: None, 76 | gid: None, 77 | }); 78 | v.push(LinuxDevice { 79 | path: "/dev/zero".to_string(), 80 | typ: LinuxDeviceType::c, 81 | major: 1, 82 | minor: 5, 83 | file_mode: Some(0o066), 84 | uid: None, 85 | gid: None, 86 | }); 87 | v.push(LinuxDevice { 88 | path: "/dev/full".to_string(), 89 | typ: LinuxDeviceType::c, 90 | major: 1, 91 | minor: 7, 92 | file_mode: Some(0o066), 93 | uid: None, 94 | gid: None, 95 | }); 96 | v.push(LinuxDevice { 97 | path: "/dev/tty".to_string(), 98 | typ: LinuxDeviceType::c, 99 | major: 5, 100 | minor: 0, 101 | file_mode: Some(0o066), 102 | uid: None, 103 | gid: None, 104 | }); 105 | v.push(LinuxDevice { 106 | path: "/dev/urandom".to_string(), 107 | typ: LinuxDeviceType::c, 108 | major: 1, 109 | minor: 9, 110 | file_mode: Some(0o066), 111 | uid: None, 112 | gid: None, 113 | }); 114 | v.push(LinuxDevice { 115 | path: "/dev/random".to_string(), 116 | typ: LinuxDeviceType::c, 117 | major: 1, 118 | minor: 8, 119 | file_mode: Some(0o066), 120 | uid: None, 121 | gid: None, 122 | }); 123 | v 124 | }; 125 | } 126 | 127 | lazy_static! 
{ 128 | static ref NAMESPACES: HashMap = { 129 | let mut result = HashMap::new(); 130 | result.insert(CloneFlags::CLONE_NEWIPC, "ipc"); 131 | result.insert(CloneFlags::CLONE_NEWUTS, "uts"); 132 | result.insert(CloneFlags::CLONE_NEWNET, "net"); 133 | result.insert(CloneFlags::CLONE_NEWPID, "pid"); 134 | result.insert(CloneFlags::CLONE_NEWNS, "mnt"); 135 | result.insert(CloneFlags::CLONE_NEWCGROUP, "cgroup"); 136 | result.insert(CloneFlags::CLONE_NEWUSER, "user"); 137 | result 138 | }; 139 | } 140 | 141 | const CONFIG: &'static str = "config.json"; 142 | const INIT_PID: &'static str = "init.pid"; 143 | const PROCESS_PID: &'static str = "process.pid"; 144 | const TSOCKETFD: RawFd = 9; 145 | 146 | #[cfg(feature = "nightly")] 147 | static mut ARGC: isize = 0 as isize; 148 | #[cfg(feature = "nightly")] 149 | static mut ARGV: *mut *mut i8 = 0 as *mut *mut i8; 150 | 151 | // using start instead of main to get direct access to arg0 152 | #[cfg(feature = "nightly")] 153 | #[start] 154 | fn start(argc: isize, argv: *const *const u8) -> isize { 155 | unsafe { 156 | // store args so we can access them later 157 | ARGC = argc; 158 | ARGV = argv as *mut *mut i8; 159 | } 160 | 161 | // enable stack unwinding 162 | if std::panic::catch_unwind(main).is_err() { 163 | 101 164 | } else { 165 | 0 166 | } 167 | } 168 | 169 | // only show backtrace in debug mode 170 | #[cfg(not(debug_assertions))] 171 | fn print_backtrace(_: &Error) {} 172 | 173 | #[cfg(debug_assertions)] 174 | fn print_backtrace(e: &Error) { 175 | match e.backtrace() { 176 | Some(backtrace) => error!("{:?}", backtrace), 177 | None => error!("to view backtrace, use RUST_BACKTRACE=1"), 178 | } 179 | } 180 | 181 | #[cfg(feature = "nightly")] 182 | fn get_args() -> Vec { 183 | // we parse args directly since we didn't call the runtime 184 | // lang_start() function to parse them. 
185 | let mut args = Vec::new(); 186 | unsafe { 187 | for i in 0..ARGC { 188 | let cstr = std::ffi::CStr::from_ptr(*ARGV.offset(i)); 189 | args.push(cstr.to_string_lossy().into_owned()); 190 | } 191 | } 192 | args 193 | } 194 | 195 | #[cfg(not(feature = "nightly"))] 196 | fn get_args() -> Vec<String> { 197 | std::env::args().collect() 198 | } 199 | 200 | fn main() { 201 | std::env::set_var("RUST_BACKTRACE", "1"); 202 | if let Err(ref e) = run() { 203 | error!("{}", e); 204 | 205 | for e in e.iter().skip(1) { 206 | error!("caused by: {}", e); 207 | } 208 | 209 | print_backtrace(e); 210 | ::std::process::exit(1); 211 | } 212 | ::std::process::exit(0); 213 | } 214 | 
/// Validates a container id given on the command line (used as the clap
/// validator for the `id` argument).
///
/// The id is later appended to the state directory to form the instance
/// path, so ids containing `..` or `/` are rejected.
///
/// Returns `Ok(())` for an acceptable id, or `Err` carrying the message that
/// is shown to the user.
#[allow(needless_pass_by_value)]
fn id_validator(val: String) -> StdResult<(), String> {
    if val.contains("..") || val.contains('/') {
        return Err(format!("id {} cannot contain '..' or '/'", val));
    }
    Ok(())
}

223 | fn run() -> Result<()> { 224 | let id_arg = Arg::with_name("id") 225 | .required(true) 226 | .takes_value(true) 227 | .validator(id_validator) 228 | .help("Unique identifier"); 229 | let bundle_arg = Arg::with_name("bundle") 230 | .required(true) 231 | .default_value(".") 232 | .long("bundle") 233 | .short("b") 234 | .help("Directory containing config.json"); 235 | let pid_arg = Arg::with_name("p") 236 | .takes_value(true) 237 | .long("pid-file") 238 | .short("p") 239 | .help("Additional location to write pid"); 240 | let init_arg = Arg::with_name("n") 241 | .help("Do not create an init process") 242 | .long("no-init") 243 | .short("n"); 244 | 245 | let matches = App::new("Railcar") 246 | .about("Railcar - run a container from an oci-runtime spec file") 247 | .setting(AppSettings::ColoredHelp) 248 | .author(crate_authors!("\n")) 249 | .setting(AppSettings::SubcommandRequired) 250 | .version(crate_version!()) 251 | .arg( 252 | Arg::with_name("v") 253 | .multiple(true) 254 | .help("Sets the level of verbosity") 255 | .short("v"), 256 | ) 257 | .arg( 258 | 
Arg::with_name("d") 259 | .help("Daemonize the process") 260 | .long("daemonize") 261 | .short("d"), 262 | ) 263 | .arg( 264 | Arg::with_name("log") 265 | .help("Compatibility (ignored)") 266 | .long("log") 267 | .takes_value(true), 268 | ) 269 | .arg( 270 | Arg::with_name("log-format") 271 | .help("Compatibility (ignored)") 272 | .long("log-format") 273 | .takes_value(true), 274 | ) 275 | .arg( 276 | Arg::with_name("r") 277 | .default_value("/run/railcar") 278 | .help("Dir for state") 279 | .long("root") 280 | .short("r") 281 | .takes_value(true), 282 | ) 283 | .subcommand( 284 | SubCommand::with_name("run") 285 | .setting(AppSettings::ColoredHelp) 286 | .arg(&id_arg) 287 | .arg(&bundle_arg) 288 | .arg(&pid_arg) 289 | .arg(&init_arg) 290 | .about("Run a container"), 291 | ) 292 | .subcommand( 293 | SubCommand::with_name("create") 294 | .setting(AppSettings::ColoredHelp) 295 | .arg(&id_arg) 296 | .arg(&bundle_arg) 297 | .arg(&pid_arg) 298 | .arg(&init_arg) 299 | // NOTE(vish): if no-trigger is specified, console 300 | // and console-socket will be loaded 301 | // by start instead of create, so 302 | // no output will appear from the init 303 | // process. 
304 | .arg( 305 | Arg::with_name("t") 306 | .help("Double fork instead of trigger") 307 | .long("no-trigger") 308 | .short("t"), 309 | ) 310 | .arg( 311 | Arg::with_name("c") 312 | .help("Console to use") 313 | .long("console") 314 | .short("c") 315 | .takes_value(true), 316 | ) 317 | .arg( 318 | Arg::with_name("console-socket") 319 | .help("socket to pass master of console") 320 | .long("console-socket") 321 | .takes_value(true), 322 | ) 323 | .about("Create a container (to be started later)"), 324 | ) 325 | .subcommand( 326 | SubCommand::with_name("start") 327 | .setting(AppSettings::ColoredHelp) 328 | .arg(&id_arg) 329 | .about("Start a (previously created) container"), 330 | ) 331 | .subcommand( 332 | SubCommand::with_name("state") 333 | .setting(AppSettings::ColoredHelp) 334 | .arg(&id_arg) 335 | .about( 336 | "Get the (json) state of a (previously created) container", 337 | ), 338 | ) 339 | .subcommand( 340 | SubCommand::with_name("kill") 341 | .setting(AppSettings::ColoredHelp) 342 | .arg(&id_arg) 343 | .arg( 344 | Arg::with_name("a") 345 | .help("Compatibility (ignored)") 346 | .long("all") 347 | .short("a"), 348 | ) 349 | .arg( 350 | Arg::with_name("signal") 351 | .default_value("TERM") 352 | .required(true) 353 | .takes_value(true) 354 | .help("Signal to send to container"), 355 | ) 356 | .about("Signal a (previously created) container"), 357 | ) 358 | .subcommand( 359 | SubCommand::with_name("delete") 360 | .setting(AppSettings::ColoredHelp) 361 | .arg(&id_arg) 362 | .arg( 363 | Arg::with_name("f") 364 | .help("Kill process if still running") 365 | .long("force") 366 | .short("f"), 367 | ) 368 | .about("Delete a (previously created) container"), 369 | ) 370 | .subcommand( 371 | SubCommand::with_name("ps") 372 | .setting(AppSettings::ColoredHelp) 373 | .arg(&id_arg) 374 | .arg( 375 | Arg::with_name("f") 376 | .help("Compatibility (ignored)") 377 | .long("format") 378 | .short("f") 379 | .takes_value(true), 380 | ) 381 | .about("List processes in a 
(previously created) container"), 382 | ) 383 | .get_matches_from(get_args()); 384 | let level = match matches.occurrences_of("v") { 385 | 0 => log::LevelFilter::Info, //default 386 | 1 => log::LevelFilter::Debug, 387 | _ => log::LevelFilter::Trace, 388 | }; 389 | 390 | let _ = log::set_logger(&logger::SIMPLE_LOGGER) 391 | .map(|()| log::set_max_level(level)); 392 | 393 | // create empty log file to avoid warning 394 | let lpath = matches.value_of("log").unwrap_or_default(); 395 | if lpath != "" { 396 | std::fs::OpenOptions::new() 397 | .create(true) 398 | .write(true) 399 | .open(lpath)?; 400 | } 401 | 402 | let state_dir = matches.value_of("r").unwrap().to_string(); 403 | debug!("ensuring railcar state dir {}", &state_dir); 404 | let chain = || format!("ensuring railcar state dir {} failed", &state_dir); 405 | create_dir_all(&state_dir).chain_err(chain)?; 406 | 407 | match matches.subcommand() { 408 | ("create", Some(create_matches)) => cmd_create( 409 | create_matches.value_of("id").unwrap(), 410 | &state_dir, 411 | create_matches, 412 | ), 413 | ("delete", Some(delete_matches)) => cmd_delete( 414 | delete_matches.value_of("id").unwrap(), 415 | &state_dir, 416 | delete_matches, 417 | ), 418 | ("kill", Some(kill_matches)) => cmd_kill( 419 | kill_matches.value_of("id").unwrap(), 420 | &state_dir, 421 | kill_matches, 422 | ), 423 | ("ps", Some(ps_matches)) => { 424 | cmd_ps(ps_matches.value_of("id").unwrap(), &state_dir) 425 | } 426 | ("run", Some(run_matches)) => { 427 | cmd_run(run_matches.value_of("id").unwrap(), run_matches) 428 | } 429 | ("start", Some(start_matches)) => { 430 | cmd_start(start_matches.value_of("id").unwrap(), &state_dir) 431 | } 432 | ("state", Some(state_matches)) => { 433 | cmd_state(state_matches.value_of("id").unwrap(), &state_dir) 434 | } 435 | // We should never reach here because clap already enforces this 436 | _ => bail!("command not recognized"), 437 | } 438 | } 439 | 440 | #[inline] 441 | fn instance_dir(id: &str, state_dir: &str) 
-> String { 442 | format!("{}/{}", state_dir, id) 443 | } 444 | 445 | fn state(id: &str, status: &str, pid: Pid, bundle: &str) -> oci::State { 446 | oci::State { 447 | version: "0.2.0".to_string(), 448 | id: id.to_string(), 449 | status: status.to_string(), 450 | pid: pid.into(), // TODO implement serde ser/de for Pid/Gid/.. 451 | bundle: bundle.to_string(), 452 | annotations: HashMap::new(), 453 | } 454 | } 455 | 
/// Reads the init pid stored in the `INIT_PID` file.
///
/// The current working directory must already be the instance dir, because
/// the file is opened by its relative name.
///
/// Returns `Pid::from_raw(-1)` when the file cannot be opened or its content
/// is not a valid pid; an error while reading an opened file is propagated.
fn get_init_pid() -> Result<Pid> {
    let mut pid = Pid::from_raw(-1);
    if let Ok(mut f) = File::open(INIT_PID) {
        let mut result = String::new();
        f.read_to_string(&mut result)?;
        // tolerate surrounding whitespace (e.g. a trailing newline)
        if let Ok(process_pid) = result.trim().parse::<i32>() {
            pid = Pid::from_raw(process_pid);
        }
    }
    Ok(pid)
}

/// Builds the `oci::State` of container `id` from the files in its
/// instance dir.
///
/// Changes the working directory to the instance dir and fails if that is
/// not possible. The reported status is derived as follows:
///
/// * `"creating"` - the config cannot be loaded yet
/// * `"created"`  - the config loads but no process pid file exists
/// * `"running"`  - a process pid file exists
/// * `"stopped"`  - the recorded process can no longer be signalled
fn state_from_dir(id: &str, state_dir: &str) -> Result<oci::State> {
    let dir = instance_dir(id, state_dir);
    chdir(&*dir).chain_err(|| format!("instance {} doesn't exist", id))?;
    let mut status = "creating";
    let mut root = String::new();
    let pid = get_init_pid()?;
    if let Ok(spec) = Spec::load(CONFIG) {
        root = spec.root.path.to_owned();
        status = "created";
        if let Ok(mut f) = File::open(PROCESS_PID) {
            status = "running";
            let mut result = String::new();
            f.read_to_string(&mut result)?;
            if let Ok(process_pid) = result.trim().parse::<i32>() {
                // signalling with `None` only probes whether the process exists
                if signals::signal_process(Pid::from_raw(process_pid), None)
                    .is_err()
                {
                    status = "stopped";
                }
            } else {
                // not safe to log during state because shim combines
                // stdout and stderr
                // warn!("invalid process pid: {}", result);
            }
        } else {
            // not safe to log during state because shim combines
            // stdout and stderr
            // warn!("could not open process pid");
        }
    }
    let st = state(id, status, pid, &root);
    Ok(st)
}

503 | fn cmd_state(id: &str, state_dir: &str) -> Result<()> { 504 | 
debug!("Performing state"); 505 | let st = state_from_dir(id, state_dir)?; 506 | println!("{}", st.to_string().chain_err(|| "invalid state")?); 507 | Ok(()) 508 | } 509 | 510 | fn cmd_create(id: &str, state_dir: &str, matches: &ArgMatches) -> Result<()> { 511 | debug!("Performing create"); 512 | let bundle = matches.value_of("bundle").unwrap(); 513 | chdir(&*bundle).chain_err(|| format!("failed to chdir to {}", bundle))?; 514 | let dir = instance_dir(id, state_dir); 515 | debug!("creating state dir {}", &dir); 516 | if let Err(e) = create_dir(&dir) { 517 | if e.kind() != std::io::ErrorKind::AlreadyExists { 518 | let chain = || format!("creating state dir {} failed", &dir); 519 | Err(e).chain_err(chain)?; 520 | } 521 | bail!("Container with id {} already exists", id); 522 | } 523 | if let Err(e) = finish_create(id, &dir, matches) { 524 | let _ = remove_dir_all(&dir); 525 | Err(e) 526 | } else { 527 | Ok(()) 528 | } 529 | } 530 | 531 | fn load_console_sockets() -> Result<(RawFd, RawFd)> { 532 | let csocket = "console-socket"; 533 | let mut csocketfd = socket( 534 | AddressFamily::Unix, 535 | SockType::Stream, 536 | SockFlag::empty(), 537 | None, 538 | )?; 539 | csocketfd = 540 | match connect(csocketfd, &SockAddr::Unix(UnixAddr::new(&*csocket)?)) { 541 | Err(e) => { 542 | if e != ::nix::Error::Sys(Errno::ENOENT) { 543 | let msg = format!("failed to open {}", csocket); 544 | return Err(e).chain_err(|| msg)?; 545 | } 546 | -1 547 | } 548 | Ok(()) => csocketfd, 549 | }; 550 | let console = "console"; 551 | let consolefd = 552 | match open(&*console, OFlag::O_NOCTTY | OFlag::O_RDWR, Mode::empty()) { 553 | Err(e) => { 554 | if e != ::nix::Error::Sys(Errno::ENOENT) { 555 | let msg = format!("failed to open {}", console); 556 | return Err(e).chain_err(|| msg)?; 557 | } 558 | -1 559 | } 560 | Ok(fd) => fd, 561 | }; 562 | Ok((csocketfd, consolefd)) 563 | } 564 | 565 | fn finish_create(id: &str, dir: &str, matches: &ArgMatches) -> Result<()> { 566 | let spec = 567 | 
Spec::load(CONFIG).chain_err(|| format!("failed to load {}", CONFIG))?; 568 | 569 | let rootfs = canonicalize(&spec.root.path) 570 | .chain_err(|| format!{"failed to find root path {}", &spec.root.path})? 571 | .to_string_lossy() 572 | .into_owned(); 573 | 574 | chdir(&*dir).chain_err(|| format!("failed to chdir to {}", &dir))?; 575 | // NOTE: There are certain configs where we will not be able to create a 576 | // console during start, so this could potentially create the 577 | // console during init and pass to the process via sendmsg. This 578 | // would also allow us to write debug data from the init process 579 | // to the console and allow us to pass stdoutio from init to the 580 | // process, fixing the lack of stdout collection if -t is not 581 | // specified when using docker run. 582 | let csocket = matches.value_of("console-socket").unwrap_or_default(); 583 | if csocket != "" { 584 | let lnk = format!("{}/console-socket", dir); 585 | symlink(&csocket, lnk)?; 586 | } 587 | // symlink the console 588 | let cons = matches.value_of("c").unwrap_or_default(); 589 | if cons != "" { 590 | let lnk = format!("{}/console", dir); 591 | symlink(&cons, lnk)?; 592 | } 593 | let (csocketfd, consolefd, tsocketfd) = if !matches.is_present("t") { 594 | let tsocket = "trigger-socket"; 595 | let tmpfd = socket( 596 | AddressFamily::Unix, 597 | SockType::Stream, 598 | SockFlag::empty(), 599 | None, 600 | )?; 601 | // NOTE(vish): we might overwrite fds 0, 1, 2 with the console 602 | // so make sure tsocketfd is a high fd that won't 603 | // get overwritten 604 | dup2(tmpfd, TSOCKETFD).chain_err(|| "could not dup tsocketfd")?; 605 | close(tmpfd).chain_err(|| "could not close tsocket tmpfd")?; 606 | let tsocketfd = TSOCKETFD; 607 | bind(tsocketfd, &SockAddr::Unix(UnixAddr::new(&*tsocket)?))?; 608 | let (csocketfd, consolefd) = load_console_sockets()?; 609 | (csocketfd, consolefd, tsocketfd) 610 | } else { 611 | (-1, -1, -1) 612 | }; 613 | 614 | let pidfile = 
matches.value_of("p").unwrap_or_default(); 615 | 616 | let child_pid = safe_run_container( 617 | id, 618 | &rootfs, 619 | &spec, 620 | Pid::from_raw(-1), 621 | true, 622 | true, 623 | true, 624 | csocketfd, 625 | consolefd, 626 | tsocketfd, 627 | )?; 628 | if child_pid != Pid::from_raw(-1) { 629 | debug!("writing init pid file {}", child_pid); 630 | let mut f = File::create(INIT_PID)?; 631 | f.write_all(child_pid.to_string().as_bytes())?; 632 | if pidfile != "" { 633 | debug!("writing process {} pid to file {}", child_pid, pidfile); 634 | let mut f = File::create(pidfile)?; 635 | f.write_all(child_pid.to_string().as_bytes())?; 636 | } 637 | let linux = spec.linux.as_ref().unwrap(); 638 | // update namespaces to enter only 639 | let mut namespaces = Vec::new(); 640 | for ns in &linux.namespaces { 641 | let space = CloneFlags::from_bits_truncate(ns.typ as i32); 642 | if let Some(name) = NAMESPACES.get(&space) { 643 | let path = format!("/proc/{}/ns/{}", child_pid, name); 644 | let n = oci::LinuxNamespace { 645 | typ: ns.typ, 646 | path: path, 647 | }; 648 | namespaces.push(n); 649 | } 650 | } 651 | let updated_linux = oci::Linux { 652 | uid_mappings: linux.uid_mappings.clone(), 653 | gid_mappings: linux.gid_mappings.clone(), 654 | sysctl: HashMap::new(), 655 | resources: None, 656 | cgroups_path: linux.cgroups_path.to_owned(), 657 | namespaces: namespaces, 658 | devices: Vec::new(), 659 | seccomp: None, 660 | rootfs_propagation: "".to_string(), 661 | masked_paths: Vec::new(), 662 | readonly_paths: Vec::new(), 663 | mount_label: "".to_string(), 664 | }; 665 | let updated = Spec { 666 | version: spec.version, 667 | platform: spec.platform, 668 | process: spec.process, 669 | root: oci::Root { 670 | path: rootfs, 671 | readonly: spec.root.readonly, 672 | }, 673 | hostname: "".to_string(), // hostname not needed 674 | mounts: Vec::new(), // remove mounts 675 | hooks: spec.hooks, 676 | annotations: spec.annotations, 677 | linux: Some(updated_linux), 678 | solaris: 
spec.solaris, 679 | windows: spec.windows, 680 | }; 681 | debug!("writing updated config"); 682 | updated 683 | .save(CONFIG) 684 | .chain_err(|| format!("failed to save {}", CONFIG))?; 685 | } 686 | Ok(()) 687 | } 688 | 689 | fn cmd_start(id: &str, state_dir: &str) -> Result<()> { 690 | debug!("Performing start"); 691 | 692 | // we use instance dir for config written out by create 693 | let dir = instance_dir(id, state_dir); 694 | chdir(&*dir).chain_err(|| format!("instance {} doesn't exist", id))?; 695 | 696 | let spec = 697 | Spec::load(CONFIG).chain_err(|| format!("failed to load {}", CONFIG))?; 698 | 699 | let init_pid = get_init_pid()?; 700 | 701 | let tsocket = "trigger-socket"; 702 | let mut tsocketfd = socket( 703 | AddressFamily::Unix, 704 | SockType::Stream, 705 | SockFlag::empty(), 706 | None, 707 | )?; 708 | tsocketfd = 709 | match connect(tsocketfd, &SockAddr::Unix(UnixAddr::new(&*tsocket)?)) { 710 | Err(e) => { 711 | if e != ::nix::Error::Sys(Errno::ENOENT) { 712 | let msg = format!("failed to open {}", tsocket); 713 | return Err(e).chain_err(|| msg)?; 714 | } 715 | -1 716 | } 717 | Ok(()) => tsocketfd, 718 | }; 719 | 720 | // if we are triggering just trigger and exit 721 | if tsocketfd != -1 { 722 | debug!("running prestart hooks"); 723 | if let Some(ref hooks) = spec.hooks { 724 | let st = state(id, "running", init_pid, &spec.root.path); 725 | for h in &hooks.prestart { 726 | execute_hook(h, &st) 727 | .chain_err(|| "failed to execute prestart hooks")?; 728 | } 729 | } 730 | let linux = spec.linux.as_ref().unwrap(); 731 | let cpath = if linux.cgroups_path == "" { 732 | format!{"/{}", id} 733 | } else { 734 | linux.cgroups_path.clone() 735 | }; 736 | // get the actual pid of the process from cgroup 737 | let mut child_pid = Pid::from_raw(-1); 738 | let procs = cgroups::get_procs("cpuset", &cpath); 739 | for p in procs { 740 | if p != init_pid { 741 | debug!("actual pid of child is {}", p); 742 | child_pid = p; 743 | break; 744 | } 745 | } 746 | let 
mut f = File::create(PROCESS_PID)?; 747 | f.write_all(child_pid.to_string().as_bytes())?; 748 | debug!("running poststart hooks"); 749 | if let Some(ref hooks) = spec.hooks { 750 | let st = state(id, "running", init_pid, &spec.root.path); 751 | for h in &hooks.poststart { 752 | if let Err(e) = execute_hook(h, &st) { 753 | warn!("failed to execute poststart hook: {}", e); 754 | } 755 | } 756 | } 757 | debug!("writing zero to trigger socket to start exec"); 758 | let data: &[u8] = &[0]; 759 | write(tsocketfd, data).chain_err(|| "failed to write zero")?; 760 | return Ok(()); 761 | } 762 | 763 | let (csocketfd, consolefd) = load_console_sockets()?; 764 | 765 | let child_pid = safe_run_container( 766 | id, 767 | &spec.root.path, 768 | &spec, 769 | init_pid, 770 | false, 771 | false, 772 | true, 773 | csocketfd, 774 | consolefd, 775 | -1, 776 | )?; 777 | if child_pid != Pid::from_raw(-1) { 778 | debug!("writing process {} pid file", child_pid); 779 | let mut f = File::create(PROCESS_PID)?; 780 | f.write_all(child_pid.to_string().as_bytes())?; 781 | } 782 | Ok(()) 783 | } 784 | 785 | fn cmd_kill(id: &str, state_dir: &str, matches: &ArgMatches) -> Result<()> { 786 | debug!("Performing kill"); 787 | let signal = signals::to_signal(matches.value_of("signal").unwrap()) 788 | .unwrap_or(Signal::SIGTERM); 789 | let dir = instance_dir(id, state_dir); 790 | chdir(&*dir).chain_err(|| format!("instance {} doesn't exist", id))?; 791 | let mut f = File::open(INIT_PID).chain_err(|| "failed to find pid")?; 792 | let mut result = String::new(); 793 | f.read_to_string(&mut result)?; 794 | if let Ok(init_pid) = result.parse::() { 795 | if signals::signal_process(Pid::from_raw(init_pid), signal).is_err() { 796 | warn!("failed signal init process {}, may have exited", init_pid); 797 | } 798 | } else { 799 | warn!("invalid process pid: {}", result); 800 | } 801 | Ok(()) 802 | } 803 | 804 | fn cmd_ps(id: &str, state_dir: &str) -> Result<()> { 805 | debug!("Performing ps"); 806 | let dir = 
instance_dir(id, state_dir); 807 | chdir(&*dir).chain_err(|| format!("instance {} doesn't exist", id))?; 808 | let mut f = File::open(PROCESS_PID).chain_err(|| "failed to find pid")?; 809 | let mut result = String::new(); 810 | f.read_to_string(&mut result)?; 811 | // TODO: return any other execed processes 812 | let mut pids = Vec::new(); 813 | if let Ok(process_pid) = result.parse::() { 814 | pids.push(Pid::from_raw(process_pid)); 815 | } else { 816 | warn!("invalid process pid: {}", result); 817 | } 818 | let pids = pids 819 | .into_iter() 820 | .map(|pid: Pid| -> i32 { pid.into() }) 821 | .collect::>(); 822 | println!( 823 | "{}", 824 | oci::serialize::to_string(&pids) 825 | .chain_err(|| "could not serialize pids")? 826 | ); 827 | Ok(()) 828 | } 829 | 830 | fn cmd_delete(id: &str, state_dir: &str, matches: &ArgMatches) -> Result<()> { 831 | debug!("Performing delete"); 832 | let dir = instance_dir(id, state_dir); 833 | if chdir(&*dir).is_err() { 834 | debug!("instance {} doesn't exist", id); 835 | warn!("returning zero to work around docker bug"); 836 | return Ok(()); 837 | } 838 | if let Ok(mut f) = File::open(PROCESS_PID) { 839 | let mut result = String::new(); 840 | f.read_to_string(&mut result)?; 841 | if let Ok(process_pid) = result.parse::() { 842 | let process_pid = Pid::from_raw(process_pid); 843 | 844 | if signals::signal_process(process_pid, None).is_ok() { 845 | if matches.is_present("f") { 846 | if let Err(e) = 847 | signals::signal_process(process_pid, Signal::SIGKILL) 848 | { 849 | let chain = || { 850 | format!("failed to kill process {} ", process_pid) 851 | }; 852 | if let Error(ErrorKind::Nix(nixerr), _) = e { 853 | if nixerr == ::nix::Error::Sys(Errno::ESRCH) { 854 | debug!("container process is already dead"); 855 | } else { 856 | Err(e).chain_err(chain)?; 857 | } 858 | } else { 859 | Err(e).chain_err(chain)?; 860 | } 861 | } 862 | } else { 863 | bail!("container process {} is still running", process_pid) 864 | } 865 | } 866 | } else { 867 
| warn!("invalid process pid: {}", result); 868 | } 869 | } else { 870 | debug!("process doesn't exist"); 871 | } 872 | if let Ok(mut f) = File::open(INIT_PID) { 873 | debug!("killing init process"); 874 | let mut result = String::new(); 875 | f.read_to_string(&mut result)?; 876 | if let Ok(ipid) = result.parse::() { 877 | if let Err(e) = 878 | signals::signal_process(Pid::from_raw(ipid), Signal::SIGKILL) 879 | { 880 | let chain = || format!("failed to kill init {} ", ipid); 881 | if let Error(ErrorKind::Nix(nixerr), _) = e { 882 | if let ::nix::Error::Sys(errno) = nixerr { 883 | if errno == Errno::ESRCH { 884 | debug!("init process is already dead"); 885 | } 886 | Err(e).chain_err(chain)?; 887 | } else { 888 | Err(e).chain_err(chain)?; 889 | } 890 | } else { 891 | Err(e).chain_err(chain)?; 892 | } 893 | } 894 | } else { 895 | warn!("invalid init pid: {}", result); 896 | } 897 | } else { 898 | debug!("init process doesn't exist"); 899 | } 900 | if let Ok(spec) = Spec::load(CONFIG) { 901 | let linux = spec.linux.as_ref().unwrap(); 902 | let cpath = if linux.cgroups_path == "" { 903 | format!{"/{}", id} 904 | } else { 905 | linux.cgroups_path.clone() 906 | }; 907 | debug!("removing cgroups"); 908 | if let Err(Error(ErrorKind::Io(e), _)) = cgroups::remove(&cpath) { 909 | if e.kind() != std::io::ErrorKind::NotFound { 910 | warn!("failed to remove cgroup dir: {}", e); 911 | } 912 | } 913 | debug!("running poststop hooks"); 914 | if let Some(ref hooks) = spec.hooks { 915 | let st = state_from_dir(id, state_dir)?; 916 | for h in &hooks.poststop { 917 | execute_hook(h, &st) 918 | .chain_err(|| "failed to execute poststop hooks")?; 919 | } 920 | } 921 | } else { 922 | debug!("config could not be loaded"); 923 | } 924 | debug!("removing state dir {}", &dir); 925 | if let Err(e) = remove_dir_all(&dir) { 926 | if e.kind() != std::io::ErrorKind::NotFound { 927 | let chain = || format!("removing state dir {} failed", &dir); 928 | Err(e).chain_err(chain)?; 929 | } 930 | 
bail!("State dir for {} disappeared", id);
    }

    Ok(())
}

/// Handles the `run` command.
///
/// Changes into the bundle directory given by the `bundle` argument, loads
/// the spec from `CONFIG` and starts the container through
/// `safe_run_container`, logging the pid that it returns.
fn cmd_run(id: &str, matches: &ArgMatches) -> Result<()> {
    let bundle = matches.value_of("bundle").unwrap();
    chdir(&*bundle).chain_err(|| format!("failed to chdir to {}", bundle))?;
    let spec =
        Spec::load(CONFIG).chain_err(|| format!("failed to load {}", CONFIG))?;

    let child_pid = safe_run_container(
        id,
        &spec.root.path,
        &spec,
        Pid::from_raw(-1),
        !matches.is_present("n"),
        false,
        matches.is_present("d"),
        -1,
        -1,
        -1,
    )?;
    info!("Container running with pid {}", child_pid);
    Ok(())
}

/// Runs a single OCI hook and waits for it to finish.
///
/// The hook is executed in a grandchild process: the first child forks the
/// process that execs the hook, writes `state` to the hook's stdin, waits
/// for it, and reports a terminating signal back over a pipe. The caller
/// waits on that pipe for at most `hook.timeout` seconds (forever when no
/// timeout is set).
///
/// # Errors
///
/// Returns `ErrorKind::InvalidHook` when the hook is killed by a signal or
/// exits with a non-zero code, and propagates pipe/fork/wait failures.
fn execute_hook(hook: &oci::Hook, state: &oci::State) -> Result<()> {
    debug!("executing hook {:?}", hook);
    let (rfd, wfd) =
        pipe2(OFlag::O_CLOEXEC).chain_err(|| "failed to create pipe")?;
    match fork()? {
        ForkResult::Child => {
            close(rfd).chain_err(|| "could not close rfd")?;
            let (rstdin, wstdin) =
                pipe2(OFlag::empty()).chain_err(|| "failed to create pipe")?;
            // fork second child to execute hook
            match fork()? {
                ForkResult::Child => {
                    close(0).chain_err(|| "could not close stdin")?;
                    dup2(rstdin, 0).chain_err(|| "could not dup to stdin")?;
                    close(rstdin).chain_err(|| "could not close rstdin")?;
                    close(wstdin).chain_err(|| "could not close wstdin")?;
                    do_exec(&hook.path, &hook.args, &hook.env)?;
                }
                ForkResult::Parent { child } => {
                    close(rstdin).chain_err(|| "could not close rstdin")?;
                    unsafe {
                        // the File takes ownership of wstdin and closes the
                        // file descriptor automatically when it is dropped
                        state
                            .to_writer(File::from_raw_fd(wstdin))
                            .chain_err(|| "could not write state")?;
                    }
                    let (exit_code, sig) = wait_for_child(child)?;
                    if let Some(signal) = sig {
                        // write signal to pipe.
                        let data: &[u8] = &[signal as u8];
                        write(wfd, data)
                            .chain_err(|| "failed to write signal hook")?;
                    }
                    close(wfd).chain_err(|| "could not close wfd")?;
                    std::process::exit(exit_code as i32);
                }
            }
        }
        ForkResult::Parent { child } => {
            // the wfd is only used by the child so close it
            close(wfd).chain_err(|| "could not close wfd")?;
            // poll takes milliseconds; clamp instead of overflowing when the
            // configured timeout (in seconds) is very large
            let timeout = match hook.timeout {
                Some(t) => {
                    let ms = (t as i64).saturating_mul(1000);
                    ms.max(i64::from(i32::min_value()))
                        .min(i64::from(i32::max_value())) as i32
                }
                None => -1,
            };
            // a timeout will cause a failure and child will be killed on exit
            if let Some(sig) = wait_for_pipe_sig(rfd, timeout)? {
                let msg = format!{"hook exited with signal: {:?}", sig};
                return Err(ErrorKind::InvalidHook(msg).into());
            }
            let (exit_code, _) = wait_for_child(child)?;
            if exit_code != 0 {
                let msg = format!{"hook exited with exit code: {}", exit_code};
                return Err(ErrorKind::InvalidHook(msg).into());
            }
        }
    };
    Ok(())
}

/// Wrapper around `run_container` that cleans up after a failure.
///
/// If `run_container` fails while still executing in the process that
/// called this function (the pid is unchanged), SIGTERM is sent to all
/// children before the error is returned. A failure to signal the children
/// is logged and does not replace the original error.
fn safe_run_container(
    id: &str,
    rootfs: &str,
    spec: &Spec,
    init_pid: Pid,
    init: bool,
    init_only: bool,
    daemonize: bool,
    csocketfd: RawFd,
    consolefd: RawFd,
    tsocketfd: RawFd,
) -> Result<Pid> {
    let pid = getpid();
    match run_container(
        id, rootfs, spec, init_pid, init, init_only, daemonize, csocketfd,
        consolefd, tsocketfd,
    ) {
        Err(e) => {
            // if we are the top level thread, kill all children
            if pid == getpid() {
                if let Err(se) = signals::signal_children(Signal::SIGTERM) {
                    warn!("failed to signal children: {}", se);
                }
            }
            Err(e)
        }
        Ok(child_pid) => Ok(child_pid),
    }
}

/// Sets up the container environment described by `spec` and execs the
/// container process.
///
/// In the original (parent) process this returns the pid reported by
/// `fork_first`. In the forked child it enters/unshares namespaces, prepares
/// and pivots into the rootfs, wires up the console, switches ids, drops
/// privileges, applies seccomp and finally execs `spec.process.args`.
///
/// `csocketfd`, `consolefd` and `tsocketfd` are `-1` when unused.
///
/// # Errors
///
/// Returns `ErrorKind::InvalidSpec` when the spec has no linux section, an
/// empty process argument list or a relative cgroup path, and propagates
/// any failure of the individual setup steps.
fn run_container(
    id: &str,
    rootfs: &str,
    spec: &Spec,
    init_pid: Pid,
    mut init: bool,
    mut init_only: bool,
    daemonize: bool,
    csocketfd: RawFd,
    mut consolefd: RawFd,
    tsocketfd: RawFd,
) -> Result<Pid> {
    if let Err(e) = prctl::set_dumpable(false) {
        bail!(format!("set dumpable returned {}", e));
    };

    // if selinux is disabled, set will fail so print a warning
    if !spec.process.selinux_label.is_empty() {
        if let Err(e) = selinux::setexeccon(&spec.process.selinux_label) {
            warn!(
                "could not set label to {}: {}",
                spec.process.selinux_label, e
            );
        };
    }

    let linux = match spec.linux {
        Some(ref l) => l,
        None => {
            let msg = "linux config is empty".to_string();
            return Err(ErrorKind::InvalidSpec(msg).into());
        }
    };

    // validate up front so we fail before forking instead of panicking on
    // an out-of-bounds index right before exec
    let program = match spec.process.args.first() {
        Some(p) => p,
        None => {
            let msg = "process args are empty".to_string();
            return Err(ErrorKind::InvalidSpec(msg).into());
        }
    };

    // initialize static variables before forking
    initialize(&DEFAULT_DEVICES);
    initialize(&NAMESPACES);
    cgroups::init();

    // collect namespaces
    let mut cf = CloneFlags::empty();
    let mut to_enter = Vec::new();
    let mut enter_pid = false;
    for ns in &linux.namespaces {
        let space = CloneFlags::from_bits_truncate(ns.typ as i32);
        if space == CloneFlags::CLONE_NEWPID {
            enter_pid = true;
        }
        if ns.path.is_empty() {
            cf |= space;
        } else {
            let fd = open(&*ns.path, OFlag::empty(), Mode::empty())
                .chain_err(|| format!("failed to open file for {:?}", space))?;
            to_enter.push((space, fd));
        }
    }
    if !enter_pid {
        init = false;
        init_only = false;
    }

    let cpath = if linux.cgroups_path == "" {
        format!{"/{}", id}
    } else {
        linux.cgroups_path.clone()
    };

    let mut bind_devices = false;
    let mut userns = false;
    let rlimits = &spec.process.rlimits;
    // fork for userns and cgroups
    if cf.contains(CloneFlags::CLONE_NEWUSER) {
        bind_devices = true;
        userns = true;
    }

    if !daemonize {
        if let Err(e) = prctl::set_child_subreaper(true) {
            bail!(format!("set subreaper returned {}", e));
        };
    }
    let (child_pid, wfd) = fork_first(
        id, init_pid, enter_pid, init_only, daemonize, userns, linux, rlimits,
        &cpath, spec,
    )?;

    // parent returns child pid and exits
    if child_pid != Pid::from_raw(-1) {
        return Ok(child_pid);
    }

    let mut mount_fd = -1;
    // enter path namespaces
    for &(space, fd) in &to_enter {
        if space == CloneFlags::CLONE_NEWNS {
            // enter mount ns last
            mount_fd = fd;
            continue;
        }
        setns(fd, space).chain_err(|| format!("failed to enter {:?}", space))?;
        close(fd)?;
        if space == CloneFlags::CLONE_NEWUSER {
            setid(Uid::from_raw(0), Gid::from_raw(0))
                .chain_err(|| "failed to setid")?;
            bind_devices = true;
        }
    }

    // TODO: handle systemd-style cgroup_path
    if !cpath.starts_with('/') {
        let msg = "cgroup path must be absolute".to_string();
        return Err(ErrorKind::InvalidSpec(msg).into());
    }

    // unshare other ns
    let chain = || format!("failed to unshare {:?}", cf);
    unshare(cf & !CloneFlags::CLONE_NEWUSER).chain_err(chain)?;

    if enter_pid {
        fork_enter_pid(init, daemonize)?;
    };

    if cf.contains(CloneFlags::CLONE_NEWUTS) {
        sethostname(&spec.hostname)?;
    }

    if cf.contains(CloneFlags::CLONE_NEWNS) {
        mounts::init_rootfs(spec, rootfs, &cpath, bind_devices)
            .chain_err(|| "failed to init rootfs")?;
    }

    if !init_only {
        // notify first parent that it can continue
        debug!("writing zero to pipe to trigger prestart");
        let data: &[u8] = &[0];
        write(wfd, data).chain_err(|| "failed to write zero")?;
    }

    if mount_fd != -1 {
        setns(mount_fd, CloneFlags::CLONE_NEWNS).chain_err(|| {
            "failed to enter CloneFlags::CLONE_NEWNS".to_string()
        })?;
        close(mount_fd)?;
    }

    if cf.contains(CloneFlags::CLONE_NEWNS) {
        mounts::pivot_rootfs(&*rootfs).chain_err(|| "failed to pivot rootfs")?;

        // only set sysctls in newns
        for (key, value) in &linux.sysctl {
            set_sysctl(key, value)?;
        }

        // NOTE: apparently criu has problems if pointing to an fd outside
        //       the filesystem namespace.
        reopen_dev_null()?;
    }

    if csocketfd != -1 {
        // openpty fills both descriptors on success; start from an invalid
        // fd instead of reading uninitialized memory
        let mut slave: libc::c_int = -1;
        let mut master: libc::c_int = -1;
        let ret = unsafe {
            libc::openpty(
                &mut master,
                &mut slave,
                std::ptr::null_mut(),
                std::ptr::null_mut(),
                std::ptr::null_mut(),
            )
        };
        Errno::result(ret).chain_err(|| "could not openpty")?;
        defer!(close(master).unwrap());
        // pass the master side of the pty over the console socket
        let data: &[u8] = b"/dev/ptmx";
        let iov = [nix::sys::uio::IoVec::from_slice(data)];
        let fds = [master];
        let cmsg = ControlMessage::ScmRights(&fds);
        sendmsg(csocketfd, &iov, &[cmsg], MsgFlags::empty(), None)?;
        consolefd = slave;
        close(csocketfd).chain_err(|| "could not close csocketfd")?;
    }
    if consolefd != -1 {
        setsid()?;
        if unsafe { libc::ioctl(consolefd, libc::TIOCSCTTY) } < 0 {
            warn!("could not TIOCSCTTY");
        };
        dup2(consolefd, 0).chain_err(|| "could not dup tty to stdin")?;
        dup2(consolefd, 1).chain_err(|| "could not dup tty to stdout")?;
        dup2(consolefd, 2).chain_err(|| "could not dup tty to stderr")?;

        if consolefd > 2 {
            close(consolefd).chain_err(|| "could not close consolefd")?;
        }

        // NOTE: we may need to fix up the mount of /dev/console
    }

    if cf.contains(CloneFlags::CLONE_NEWNS) {
        mounts::finish_rootfs(spec).chain_err(|| "failed to finish rootfs")?;
    }

    // change to specified working directory
    if !spec.process.cwd.is_empty() {
        chdir(&*spec.process.cwd)?;
    }

    debug!("setting ids");

    // set uid/gid/groups
    let uid = Uid::from_raw(spec.process.user.uid);
    let gid = Gid::from_raw(spec.process.user.gid);
    setid(uid, gid)?;
    if !spec.process.user.additional_gids.is_empty() {
        setgroups(&spec.process.user.additional_gids)?;
    }

    // NOTE: if we want init to pass signals to other processes, we may want
    //       to hold on to cap kill until after the final fork.
    if spec.process.no_new_privileges {
        if let Err(e) = prctl::set_no_new_privileges(true) {
            bail!(format!("set no_new_privs returned {}", e));
        };
        // drop privileges
        if let Some(ref c) = spec.process.capabilities {
            capabilities::drop_privileges(c)?;
        }
        if let Some(ref seccomp) = linux.seccomp {
            seccomp::initialize_seccomp(seccomp)?;
        }
    } else {
        // NOTE: if we have not set no new privileges, we must set up seccomp
        //       before capset, which will error if seccomp blocks it
        if let Some(ref seccomp) = linux.seccomp {
            seccomp::initialize_seccomp(seccomp)?;
        }
        // drop privileges
        if let Some(ref c) = spec.process.capabilities {
            capabilities::drop_privileges(c)?;
        }
    }

    // notify first parent that it can continue
    debug!("writing zero to pipe to trigger poststart");
    let data: &[u8] = &[0];
    write(wfd, data).chain_err(|| "failed to write zero")?;

    if init {
        if init_only && tsocketfd == -1 {
            do_init(wfd, daemonize)?;
        } else {
            fork_final_child(wfd, tsocketfd, daemonize)?;
        }
    }

    // we no longer need wfd, so close it
    close(wfd).chain_err(|| "could not close wfd")?;

    // wait for trigger
    if tsocketfd != -1 {
        listen(tsocketfd, 1)?;
        let fd = accept(tsocketfd)?;
        wait_for_pipe_zero(fd, -1)?;
        close(fd).chain_err(|| "could not close accept fd")?;
        close(tsocketfd).chain_err(|| "could not close trigger fd")?;
    }

    do_exec(program, &spec.process.args, &spec.process.env)?;
    Ok(Pid::from_raw(-1))
}

/// Performs the first fork of container startup.
///
/// The child applies `oom_score_adj` and rlimits, optionally unshares the
/// user namespace, synchronizes with the parent and then returns
/// `(Pid::from_raw(-1), wfd)` so the caller can continue setup.
///
/// The parent writes uid/gid mappings (when `userns`), applies cgroups,
/// runs prestart and poststart hooks (unless `init_only`) and looks up the
/// container pid in the `cpuset` cgroup. When `daemonize` is set it returns
/// that pid; otherwise it forwards signals, waits for the container and
/// exits with its status.
fn fork_first(
    id: &str,
    init_pid: Pid,
    enter_pid: bool,
    init_only: bool,
    daemonize: bool,
    userns: bool,
    linux: &Linux,
    rlimits: &[LinuxRlimit],
    cpath: &str,
    spec: &Spec,
) -> Result<(Pid, RawFd)> {
    let ccond = Cond::new().chain_err(|| "failed to create cond")?;
    let pcond = Cond::new().chain_err(|| "failed to create cond")?;
    let (rfd, wfd) =
        pipe2(OFlag::O_CLOEXEC).chain_err(|| "failed to create pipe")?;
    match fork()? {
        ForkResult::Child => {
            close(rfd).chain_err(|| "could not close rfd")?;
            set_name("rc-user")?;

            // set oom_score_adj
            if let Some(ref r) = linux.resources {
                if let Some(adj) = r.oom_score_adj {
                    let mut f = File::create("/proc/self/oom_score_adj")?;
                    f.write_all(adj.to_string().as_bytes())?;
                }
            }

            // set rlimits (before entering user ns)
            for rlimit in rlimits.iter() {
                setrlimit(rlimit.typ as i32, rlimit.soft, rlimit.hard)?;
            }

            if userns {
                unshare(CloneFlags::CLONE_NEWUSER)
                    .chain_err(|| "failed to unshare user")?;
            }
            ccond.notify().chain_err(|| "failed to notify parent")?;
            pcond.wait().chain_err(|| "failed to wait for parent")?;
            if userns {
                setid(Uid::from_raw(0), Gid::from_raw(0))
                    .chain_err(|| "failed to setid")?;
            }
            // child continues on
        }
        ForkResult::Parent { child } => {
            close(wfd).chain_err(|| "could not close wfd")?;
            ccond.wait().chain_err(|| "failed to wait for child")?;
            if userns {
                // write uid/gid map
                write_mappings(
                    &format!("/proc/{}/uid_map", child),
                    &linux.uid_mappings,
                ).chain_err(|| "failed to write uid mappings")?;
                write_mappings(
                    &format!("/proc/{}/gid_map", child),
                    &linux.gid_mappings,
                ).chain_err(|| "failed to write gid mappings")?;
            }
            // setup cgroups
            let schild = child.to_string();
            cgroups::apply(&linux.resources, &schild, cpath)?;
            // notify child
            pcond.notify().chain_err(|| "failed to notify child")?;

            // NOTE: if we are entering pid, we wait for the next
            //       child to exit so we can adopt its grandchild
            if enter_pid {
                let (_, _) = wait_for_child(child)?;
            }
            let mut pid = Pid::from_raw(-1);
            wait_for_pipe_zero(rfd, -1)?;
            // get the actual pid of the process from cgroup
            let procs = cgroups::get_procs("cpuset", cpath);
            for p in procs {
                if p != init_pid {
                    debug!("actual pid of child is {}", p);
                    pid = p;
                    break;
                }
            }
            if !init_only {
                debug!("running prestart hooks");
                if let Some(ref hooks) = spec.hooks {
                    let st = state(id, "running", init_pid, &spec.root.path);
                    for h in &hooks.prestart {
                        execute_hook(h, &st)
                            .chain_err(|| "failed to execute prestart hooks")?;
                    }
                }
                wait_for_pipe_zero(rfd, -1)?;
                debug!("running poststart hooks");
                if let Some(ref hooks) = spec.hooks {
                    let st = state(id, "running", init_pid, &spec.root.path);
                    for h in &hooks.poststart {
                        // poststart failures are only logged
                        if let Err(e) = execute_hook(h, &st) {
                            warn!("failed to execute poststart hook: {}", e);
                        }
                    }
                }
            }
            if daemonize {
                debug!("first parent exiting for daemonization");
                return Ok((pid, wfd));
            }
            signals::pass_signals(pid)?;
            let sig = wait_for_pipe_sig(rfd, -1)?;
            let (exit_code, _) = wait_for_child(pid)?;
            cgroups::remove(cpath)?;
            exit(exit_code as i8, sig)?;
        }
    };
    Ok((Pid::from_raw(-1), wfd))
}

/// Forks so that the continuing child runs inside the new pid namespace.
///
/// The parent of the fork exits with status 0. When `init` is set the child
/// is renamed to `rc-init`; otherwise, when daemonizing, one more fork is
/// done and the intermediate process exits as well.
fn fork_enter_pid(init: bool, daemonize: bool) -> Result<()> {
    // do the first fork right away because we must fork before we can
    // mount proc. The child will be in the pid namespace.
    match fork()? {
        ForkResult::Child => {
            if init {
                set_name("rc-init")?;
            } else if daemonize {
                // NOTE: if we are daemonizing non-init, we need an additional
                //       fork to allow process to be reparented to init
                match fork()? {
                    ForkResult::Child => {
                        // child continues
                    }
                    ForkResult::Parent { .. } => {
                        debug!("third parent exiting for daemonization");
                        exit(0, None)?;
                    }
                }
            }
            // child continues
        }
        ForkResult::Parent { .. } => {
            debug!("second parent exiting");
            exit(0, None)?;
        }
    };
    Ok(())
}

/// Forks once more: the child returns to the caller, while the parent
/// closes the trigger fd (if any) and stays behind running `do_init`.
fn fork_final_child(wfd: RawFd, tfd: RawFd, daemonize: bool) -> Result<()> {
    // fork again so child becomes pid 2
    match fork()? {
        ForkResult::Child => {
            // child continues on
            Ok(())
        }
        ForkResult::Parent { .. } => {
            if tfd != -1 {
                close(tfd).chain_err(|| "could not close trigger fd")?;
            }
            do_init(wfd, daemonize)?;
            Ok(())
        }
    }
}

/// Minimal init loop.
///
/// Blocks all signals and waits for them synchronously. On SIGCHLD the
/// children are reaped; once a reaped child reports an exit or a
/// terminating signal the process exits with the matching code
/// (`128 + signal` for signals), first writing the signal number to `wfd`
/// when not daemonized. Every other signal is forwarded to all processes
/// via pid `-1`.
fn do_init(wfd: RawFd, daemonize: bool) -> Result<()> {
    if daemonize {
        close(wfd).chain_err(|| "could not close wfd")?;
    }
    let s = SigSet::all();
    s.thread_block()?;
    loop {
        let signal = s.wait()?;
        if signal == Signal::SIGCHLD {
            debug!("got a sigchld");
            let mut sig = None;
            let code;
            match reap_children()? {
                WaitStatus::Exited(_, c) => code = c as i32,
                WaitStatus::Signaled(_, s, _) => {
                    sig = Some(s);
                    code = 128 + s as libc::c_int;
                }
                _ => continue,
            };
            if !daemonize {
                if let Some(s) = sig {
                    // raising from pid 1 doesn't work as you would
                    // expect, so write signal to pipe.
                    let data: &[u8] = &[s as u8];
                    write(wfd, data).chain_err(|| "failed to write signal")?;
                }
                close(wfd).chain_err(|| "could not close wfd")?;
            }
            debug!("all children terminated, exiting with {}", code);
            std::process::exit(code)
        }
        debug!("passing {:?} on to children", signal);
        if let Err(e) = signals::signal_process(Pid::from_raw(-1), signal) {
            warn!("failed to signal children, {}", e);
        }
    }
}

/// Replaces the environment with `env` and execs `path` with `args`.
///
/// Only returns on failure.
///
/// # Errors
///
/// Fails when `path` contains an interior NUL byte, when the environment
/// cannot be reset, or when `execvp` itself fails.
fn do_exec(path: &str, args: &[String], env: &[String]) -> Result<()> {
    let p = CString::new(path)
        .chain_err(|| format!("invalid exec path {:?}", path))?;
    // NOTE: an argument or variable containing an interior NUL cannot be
    //       represented and is replaced by an empty string (best effort)
    let a: Vec<CString> = args
        .iter()
        .map(|s| CString::new(s.as_str()).unwrap_or_default())
        .collect();
    let env: Vec<CString> = env
        .iter()
        .map(|s| CString::new(s.as_str()).unwrap_or_default())
        .collect();
    // execvp doesn't use env for the search path, so we set env manually
    clearenv()?;
    for e in &env {
        debug!("adding {:?} to env", e);
        putenv(e)?;
    }
    execvp(&p, &a).chain_err(|| "failed to exec")?;
    // should never reach here
    Ok(())
}

/// Writes id mappings to `path` (a `uid_map`/`gid_map` file), one
/// `container_id host_id size` line per mapping, in a single write.
/// Nothing is opened or written when `maps` is empty.
fn write_mappings(path: &str, maps: &[LinuxIDMapping]) -> Result<()> {
    let data: String = maps
        .iter()
        .map(|m| format!("{} {} {}\n", m.container_id, m.host_id, m.size))
        .collect();
    if !data.is_empty() {
        let fd = open(path, OFlag::O_WRONLY, Mode::empty())?;
        defer!(close(fd).unwrap());
        write(fd, data.as_bytes())?;
    }
    Ok(())
}

fn
set_sysctl(key: &str, value: &str) -> Result<()> {
    // Writes `value` to the /proc/sys entry for `key` (dots in the key
    // become path separators). A missing entry is only logged.
    let path = format!{"/proc/sys/{}", key.replace(".", "/")};
    let fd = match open(&*path, OFlag::O_RDWR, Mode::empty()) {
        Err(::nix::Error::Sys(errno)) => {
            if errno != Errno::ENOENT {
                let msg = format!("could not set sysctl {} to {}", key, value);
                Err(::nix::Error::Sys(errno)).chain_err(|| msg)?;
            }
            warn!("could not set {} because it doesn't exist", key);
            return Ok(());
        }
        Err(e) => Err(e)?,
        Ok(fd) => fd,
    };
    defer!(close(fd).unwrap());
    write(fd, value.as_bytes())?;
    Ok(())
}

/// Re-opens every standard descriptor (0, 1, 2) whose `st_rdev` matches
/// that of a freshly opened `/dev/null`, so that it refers to the
/// `/dev/null` visible at the time of the call.
fn reopen_dev_null() -> Result<()> {
    let null_fd = open("/dev/null", OFlag::O_WRONLY, Mode::empty())?;
    let null_stat = fstat(null_fd)?;
    defer!(close(null_fd).unwrap());
    for fd in 0..3 {
        if let Ok(stat) = fstat(fd) {
            if stat.st_rdev == null_stat.st_rdev {
                if fd == 0 {
                    // close and reopen to get RDONLY
                    close(fd)?;
                    open("/dev/null", OFlag::O_RDONLY, Mode::empty())?;
                } else {
                    // we already have wronly fd, so duplicate it
                    dup2(null_fd, fd)?;
                }
            }
        }
    }
    Ok(())
}

/// Reads up to `num` bytes from `rfd`, one byte per poll/read cycle.
///
/// `timeout` is passed to `poll` (milliseconds, `-1` blocks). If the write
/// end is closed before `num` bytes arrive, `rfd` is closed and the bytes
/// read so far are returned.
///
/// # Errors
///
/// `ErrorKind::Timeout` when poll reports no ready descriptor,
/// `ErrorKind::PipeClosed` when poll reports exactly `POLLNVAL`, and any
/// poll/read/close failure.
fn wait_for_pipe_vec(
    rfd: RawFd,
    timeout: i32,
    num: usize,
) -> Result<Vec<u8>> {
    let mut result = Vec::new();
    while result.len() < num {
        let pfds =
            &mut [PollFd::new(rfd, EventFlags::POLLIN | EventFlags::POLLHUP)];
        match poll(pfds, timeout) {
            Err(e) => {
                // retry when interrupted by a signal
                if e != ::nix::Error::Sys(Errno::EINTR) {
                    return Err(e).chain_err(|| "unable to poll rfd");
                }
                continue;
            }
            Ok(n) => {
                if n == 0 {
                    return Err(ErrorKind::Timeout(timeout).into());
                }
            }
        }
        let revents = pfds[0].revents();
        let events = match revents {
            Some(ev) => ev,
            // continue on no events
            None => continue,
        };
        if events == EventFlags::POLLNVAL {
            let msg = "file descriptor closed unexpectedly".to_string();
            return Err(ErrorKind::PipeClosed(msg).into());
        }
        if !events.intersects(EventFlags::POLLIN | EventFlags::POLLHUP) {
            // continue on other events (should not happen)
            debug!("got a continue on other events {:?}", revents);
            continue;
        }
        let data: &mut [u8] = &mut [0];
        let n = read(rfd, data).chain_err(|| "could not read from rfd")?;
        if n == 0 {
            // the wfd was closed so close our end
            close(rfd).chain_err(|| "could not close rfd")?;
            break;
        }
        result.extend(data.iter().cloned());
    }
    Ok(result)
}

/// Waits for one byte on `rfd` and decodes it as a signal number.
/// Returns `Ok(None)` when the pipe was closed without any data.
fn wait_for_pipe_sig(rfd: RawFd, timeout: i32) -> Result<Option<Signal>> {
    let result = wait_for_pipe_vec(rfd, timeout, 1)?;
    if result.is_empty() {
        return Ok(None);
    }
    let chain = || "invalid signal";
    let s = Signal::from_c_int(result[0] as i32).chain_err(chain)?;
    Ok(Some(s))
}

/// Waits for one byte on `rfd` and requires it to be `0`.
///
/// # Errors
///
/// `ErrorKind::PipeClosed` when the pipe was closed without data and
/// `ErrorKind::InvalidValue` when the byte is not zero.
fn wait_for_pipe_zero(rfd: RawFd, timeout: i32) -> Result<()> {
    let result = wait_for_pipe_vec(rfd, timeout, 1)?;
    if result.is_empty() {
        let msg = "file descriptor closed unexpectedly".to_string();
        return Err(ErrorKind::PipeClosed(msg).into());
    }
    if result[0] != 0 {
        let msg = format!{"got {} from pipe instead of 0", result[0]};
        return Err(ErrorKind::InvalidValue(msg).into());
    }
    Ok(())
}

/// Waits until `child` terminates (any child when `child` is pid `-1`),
/// reaping other children along the way.
///
/// Returns `(exit_code, None)` for a normal exit and `(0, Some(signal))`
/// when the child was killed by a signal.
fn wait_for_child(child: Pid) -> Result<(i32, Option<Signal>)> {
    loop {
        // wait on all children, but only return if we match child.
        let result = match waitpid(Pid::from_raw(-1), None) {
            Err(::nix::Error::Sys(errno)) => {
                // ignore EINTR as it gets sent when we get a SIGCHLD
                if errno == Errno::EINTR {
                    continue;
                }
                let msg = format!("could not waitpid on {}", child);
                return Err(::nix::Error::Sys(errno)).chain_err(|| msg);
            }
            Err(e) => {
                return Err(e.into());
            }
            Ok(s) => s,
        };
        match result {
            WaitStatus::Exited(pid, code) => {
                if child != Pid::from_raw(-1) && pid != child {
                    continue;
                }
                reap_children()?;
                return Ok((code as i32, None));
            }
            WaitStatus::Signaled(pid, signal, _) => {
                if child != Pid::from_raw(-1) && pid != child {
                    continue;
                }
                reap_children()?;
                return Ok((0, Some(signal)));
            }
            _ => {}
        };
    }
}

/// Terminates the current process the way the child did: with `exit_code`
/// when `sig` is `None`, otherwise by raising `sig` through
/// `signals::raise_for_parent` and waiting for it to be delivered.
fn exit(exit_code: i8, sig: Option<Signal>) -> Result<()> {
    match sig {
        Some(signal) => {
            debug!("child exited with signal {:?}", signal);

            signals::raise_for_parent(signal)?;
            // wait for normal signal handler to deal with us
            loop {
                signals::wait_for_signal()?;
            }
        }
        None => {
            debug!("child exited with code {:?}", exit_code);
            std::process::exit(exit_code as i32);
        }
    }
}

/// Reaps terminated children without blocking until none are left
/// (`ECHILD`) or the remaining ones are still running, and returns the last
/// status seen. The initial value is `Exited(pid 0, 0)`.
fn reap_children() -> Result<WaitStatus> {
    let mut result = WaitStatus::Exited(Pid::from_raw(0), 0);
    loop {
        match waitpid(Pid::from_raw(-1), Some(WaitPidFlag::WNOHANG)) {
            Err(e) => {
                if e != ::nix::Error::Sys(Errno::ECHILD) {
                    return Err(e).chain_err(|| "could not waitpid");
                }
                // ECHILD means no processes are left
                break;
            }
            Ok(s) => {
                result = s;
                if result == WaitStatus::StillAlive {
                    break;
                }
            }
        }
    }
    Ok(result)
}

/// Switches the real, effective and saved gid and uid to `gid`/`uid`.
///
/// Keep-capabilities is enabled around the switch, and the effective
/// capability set is reset afterwards when the new uid is not root.
fn setid(uid: Uid, gid: Gid) -> Result<()> {
    // set uid/gid
    if let Err(e) = prctl::set_keep_capabilities(true) {
        bail!(format!("set keep capabilities returned {}", e));
    };
    setresgid(gid, gid, gid)?;
    setresuid(uid, uid, uid)?;
    // if we change from zero, we lose effective caps
    if uid != Uid::from_raw(0) {
        capabilities::reset_effective()?;
    }
    if let Err(e) = prctl::set_keep_capabilities(false) {
        bail!(format!("set keep capabilities returned {}", e));
    };
    Ok(())
}

/// Sets the process name via prctl and additionally overwrites the string
/// `ARGV` points at with `name`, copying at most the length of the existing
/// string.
#[cfg(feature = "nightly")]
fn set_name(name: &str) -> Result<()> {
    match prctl::set_name(name) {
        Err(i) => bail!(format!("set name returned {}", i)),
        Ok(_) => (),
    };
    unsafe {
        let init =
            std::ffi::CString::new(name).chain_err(|| "invalid process name")?;
        let len = std::ffi::CStr::from_ptr(*ARGV).to_bytes().len();
        // after fork, ARGV points to the thread's local
        // copy of arg0.
        libc::strncpy(*ARGV, init.as_ptr(), len);
        // no need to set the final character to 0 since
        // the initial string was already null-terminated.
}
    Ok(())
}

/// Sets the process name via prctl (build without the `nightly` feature).
#[cfg(not(feature = "nightly"))]
fn set_name(name: &str) -> Result<()> {
    if let Err(e) = prctl::set_name(name) {
        bail!(format!("set name returned {}", e));
    };
    Ok(())
}
--------------------------------------------------------------------------------
/src/mounts.rs:
--------------------------------------------------------------------------------
use cgroups;
use errors::*;
use nix::errno::Errno;
use nix::fcntl::{open, OFlag};
use nix::mount::MsFlags;
use nix::mount::*;
use nix::sys::stat::{mknod, umask};
use nix::sys::stat::{Mode, SFlag};
use nix::unistd::{chdir, chown, close, getcwd, pivot_root};
use nix::unistd::{Gid, Uid};
use nix::NixPath;
use nix_ext::fchdir;
use oci::{LinuxDevice, LinuxDeviceType, Mount, Spec};
use selinux::setfilecon;
use std::collections::HashMap;
use std::fs::OpenOptions;
use std::fs::{canonicalize, create_dir_all, remove_file};
use std::os::unix::fs::symlink;
use std::path::{Path, PathBuf};

/// Prepares the container root filesystem before the pivot.
///
/// Sets the propagation of `/` according to `linux.rootfs_propagation`
/// (`slave` when empty), bind-mounts `rootfs` onto itself, performs every
/// mount listed in the spec and finally creates the default symlinks,
/// devices and ptmx inside `rootfs`.
///
/// # Errors
///
/// Returns `ErrorKind::InvalidSpec` when the spec has no linux section,
/// when the propagation value is unknown, or when a mount destination is
/// not absolute or contains `..`; mount and filesystem failures are
/// propagated.
pub fn init_rootfs(
    spec: &Spec,
    rootfs: &str,
    cpath: &str,
    bind_devices: bool,
) -> Result<()> {
    // the linux section is needed below for labels and devices, so reject
    // a spec without it instead of panicking on unwrap
    let linux = match spec.linux {
        Some(ref l) => l,
        None => {
            let msg = "linux config is empty".to_string();
            return Err(ErrorKind::InvalidSpec(msg).into());
        }
    };
    // set namespace propagation
    let propagation = match linux.rootfs_propagation.as_str() {
        "shared" => MsFlags::MS_SHARED,
        "private" => MsFlags::MS_PRIVATE,
        "slave" | "" => MsFlags::MS_SLAVE,
        _ => {
            let msg = format!(
                "invalid propogation value: {}",
                linux.rootfs_propagation
            );
            return Err(ErrorKind::InvalidSpec(msg).into());
        }
    };
    let flags = MsFlags::MS_REC | propagation;
    mount(None::<&str>, "/", None::<&str>, flags, None::<&str>)?;

    // mount root dir
    mount(
        Some(rootfs),
        rootfs,
        None::<&str>,
        MsFlags::MS_BIND | MsFlags::MS_REC,
        None::<&str>,
    )?;

    for m in &spec.mounts {
        // TODO: check for nasty destinations involving symlinks and illegal
        //       locations.
        // NOTE: this strictly is less permissive than runc, which allows ..
        //       as long as the resulting path remains in the rootfs. There
        //       is no good reason to allow this so we just forbid it
        if !m.destination.starts_with('/') || m.destination.contains("..") {
            let msg = format!("invalid mount destination: {}", m.destination);
            return Err(ErrorKind::InvalidSpec(msg).into());
        }
        let (flags, data) = parse_mount(m);
        if m.typ == "cgroup" {
            mount_cgroups(m, rootfs, flags, &data, &linux.mount_label, cpath)?;
        } else if m.destination == "/dev" {
            // dev can't be read only yet because we have to mount devices
            mount_from(
                m,
                rootfs,
                flags & !MsFlags::MS_RDONLY,
                &data,
                &linux.mount_label,
            )?;
        } else {
            mount_from(m, rootfs, flags, &data, &linux.mount_label)?;
        }
    }

    // chdir into the rootfs so we can make devices with simpler paths
    let olddir = getcwd()?;
    chdir(rootfs)?;

    default_symlinks()?;
    create_devices(&linux.devices, bind_devices)?;
    ensure_ptmx()?;

    chdir(&olddir)?;

    Ok(())
}

/// Makes `path` the new root filesystem.
///
/// Calls `pivot_root(path, path)`, lazily unmounts `/` afterwards and then
/// changes directory into the new root through a descriptor opened
/// beforehand.
pub fn pivot_rootfs<P: ?Sized + NixPath>(path: &P) -> Result<()> {
    let oldroot =
        open("/", OFlag::O_DIRECTORY | OFlag::O_RDONLY, Mode::empty())?;
    defer!(close(oldroot).unwrap());
    let newroot =
        open(path, OFlag::O_DIRECTORY | OFlag::O_RDONLY, Mode::empty())?;
    defer!(close(newroot).unwrap());
    pivot_root(path, path)?;
    umount2("/", MntFlags::MNT_DETACH)?;
    fchdir(newroot)?;
    Ok(())
}

/// Applies the final filesystem restrictions after the pivot: masked and
/// read-only paths, a read-only remount of `/dev` when its mount options
/// ask for it, a read-only remount of `/` when `spec.root.readonly` is set,
/// and a umask of `0o022`.
pub fn finish_rootfs(spec: &Spec) -> Result<()> {
    if let Some(ref linux) = spec.linux {
        for path in &linux.masked_paths {
            mask_path(path)?;
        }
        for path in &linux.readonly_paths {
            readonly_path(path)?;
        }
    }

    // remount dev ro if necessary
    for m in &spec.mounts {
        if m.destination == "/dev" {
            let (flags, _) = parse_mount(m);
            if flags.contains(MsFlags::MS_RDONLY) {
                mount(
                    Some("/dev"),
                    "/dev",
                    None::<&str>,
                    flags | MsFlags::MS_REMOUNT,
                    None::<&str>,
                )?;
            }
        }
    }

    if spec.root.readonly {
        let flags = MsFlags::MS_BIND
            | MsFlags::MS_RDONLY
            | MsFlags::MS_NODEV
            | MsFlags::MS_REMOUNT;
        mount(Some("/"), "/", None::<&str>, flags, None::<&str>)?;
    }

    umask(Mode::from_bits_truncate(0o022));
    Ok(())
}

// Mount option name -> (clear, flag): `clear == true` removes the flag from
// the accumulated set, `false` adds it (see parse_mount).
#[cfg_attr(rustfmt, rustfmt_skip)]
lazy_static! {
    static ref OPTIONS: HashMap<&'static str, (bool, MsFlags)> = {
        let mut m = HashMap::new();
        m.insert("defaults", (false, MsFlags::empty()));
        m.insert("ro", (false, MsFlags::MS_RDONLY));
        m.insert("rw", (true, MsFlags::MS_RDONLY));
        m.insert("suid", (true, MsFlags::MS_NOSUID));
        m.insert("nosuid", (false, MsFlags::MS_NOSUID));
        m.insert("dev", (true, MsFlags::MS_NODEV));
        m.insert("nodev", (false, MsFlags::MS_NODEV));
        m.insert("exec", (true, MsFlags::MS_NOEXEC));
        m.insert("noexec", (false, MsFlags::MS_NOEXEC));
        m.insert("sync", (false, MsFlags::MS_SYNCHRONOUS));
        m.insert("async", (true, MsFlags::MS_SYNCHRONOUS));
        m.insert("dirsync", (false, MsFlags::MS_DIRSYNC));
        m.insert("remount", (false, MsFlags::MS_REMOUNT));
        m.insert("mand", (false, MsFlags::MS_MANDLOCK));
        m.insert("nomand", (true, MsFlags::MS_MANDLOCK));
        m.insert("atime", (true, MsFlags::MS_NOATIME));
        m.insert("noatime", (false, MsFlags::MS_NOATIME));
        m.insert("diratime", (true, MsFlags::MS_NODIRATIME));
        m.insert("nodiratime", (false,
MsFlags::MS_NODIRATIME));
        m.insert("bind", (false, MsFlags::MS_BIND));
        m.insert("rbind", (false, MsFlags::MS_BIND | MsFlags::MS_REC));
        m.insert("unbindable", (false, MsFlags::MS_UNBINDABLE));
        m.insert("runbindable", (false, MsFlags::MS_UNBINDABLE | MsFlags::MS_REC));
        m.insert("private", (false, MsFlags::MS_PRIVATE));
        m.insert("rprivate", (false, MsFlags::MS_PRIVATE | MsFlags::MS_REC));
        m.insert("shared", (false, MsFlags::MS_SHARED));
        m.insert("rshared", (false, MsFlags::MS_SHARED | MsFlags::MS_REC));
        m.insert("slave", (false, MsFlags::MS_SLAVE));
        m.insert("rslave", (false, MsFlags::MS_SLAVE | MsFlags::MS_REC));
        m.insert("relatime", (false, MsFlags::MS_RELATIME));
        m.insert("norelatime", (true, MsFlags::MS_RELATIME));
        m.insert("strictatime", (false, MsFlags::MS_STRICTATIME));
        m.insert("nostrictatime", (true, MsFlags::MS_STRICTATIME));
        m
    };
}

/// Builds the cgroup hierarchy view at mount `m`'s destination.
///
/// A tmpfs is mounted at the destination, each entry of `cgroups::MOUNTS`
/// that has a path for `cpath` is bind-mounted below it under the last
/// component of its mount path, and symlinks are created for the parts of
/// comma-combined controller names. When `flags` contains `MS_RDONLY` the
/// tmpfs is remounted read-only at the end.
fn mount_cgroups(
    m: &Mount,
    rootfs: &str,
    flags: MsFlags,
    data: &str,
    label: &str,
    cpath: &str,
) -> Result<()> {
    let cm = Mount {
        source: "tmpfs".to_string(),
        typ: "tmpfs".to_string(),
        destination: m.destination.clone(),
        options: Vec::new(),
    };
    let cflags = MsFlags::MS_NOEXEC | MsFlags::MS_NOSUID | MsFlags::MS_NODEV;
    // mount tmpfs for mounts
    mount_from(&cm, rootfs, cflags, "", label)?;
    for (key, mount_path) in cgroups::MOUNTS.iter() {
        let source = if let Some(s) = cgroups::path(key, cpath) {
            s
        } else {
            continue;
        };

        // NOTE: this will bind mount over the same location if two
        //       cgroups are mounted to directories with the same
        //       value at the end of the path, for example:
        //       /path/to/a/cgroup /path/to/b/cgroup
        //       runc mounts by using the final path component, so
        //       we do the same thing here.
        let base = if let Some(o) = mount_path.rfind('/') {
            &mount_path[o + 1..]
        } else {
            &mount_path[..]
        };
        let dest = format!{"{}/{}", &m.destination, &base};
        let bm = Mount {
            source: source,
            typ: "bind".to_string(),
            destination: dest,
            options: Vec::new(),
        };
        mount_from(
            &bm,
            rootfs,
            flags | MsFlags::MS_BIND | MsFlags::MS_REC,
            data,
            label,
        )?;
        for k in key.split(',') {
            if k != key {
                // try to create a symlink for combined strings
                let dest = format!{"{}{}/{}", rootfs, &m.destination, &k};
                symlink(key, &dest)?;
            }
        }
    }
    // remount readonly if necessary
    if flags.contains(MsFlags::MS_RDONLY) {
        let dest = format!{"{}{}", rootfs, &m.destination};
        // MS_RDONLY has to be part of the remount flags, otherwise the
        // tmpfs stays writable
        mount(
            Some(&*dest),
            &*dest,
            None::<&str>,
            cflags
                | MsFlags::MS_RDONLY
                | MsFlags::MS_BIND
                | MsFlags::MS_REMOUNT,
            None::<&str>,
        )?;
    }
    Ok(())
}

/// Splits the options of `m` into mount flags and a data string.
///
/// Options found in `OPTIONS` set or clear their flag; every other option
/// is passed through, joined with commas, as filesystem-specific data.
fn parse_mount(m: &Mount) -> (MsFlags, String) {
    let mut flags = MsFlags::empty();
    let mut data = Vec::new();
    for s in &m.options {
        match OPTIONS.get(s.as_str()) {
            Some(&(clear, f)) => {
                if clear {
                    flags &= !f;
                } else {
                    flags |= f;
                }
            }
            None => {
                data.push(s.as_str());
            }
        };
    }
    (flags, data.join(","))
}

/// Mounts `m` below `rootfs`.
///
/// When `label` is set (and the type is neither `proc` nor `sysfs`) a
/// `context="…"` option is appended to the mount data. If the mount fails
/// with `EINVAL` it is retried without the label and the label is applied
/// with `setfilecon` instead (a failure there is only logged). Bind mounts
/// with additional flags are remounted so those flags take effect.
///
/// # Errors
///
/// Propagates canonicalization failures of a bind source and every mount
/// error other than the `EINVAL` that triggers the retry.
fn mount_from(
    m: &Mount,
    rootfs: &str,
    flags: MsFlags,
    data: &str,
    label: &str,
) -> Result<()> {
    let d = if !label.is_empty() && m.typ != "proc" && m.typ != "sysfs" {
        if data.is_empty() {
            format!{"context=\"{}\"", label}
        } else {
            format!{"{},context=\"{}\"", data, label}
        }
    } else {
        data.to_string()
    };

    let dest = format!{"{}{}", rootfs, &m.destination};

    debug!(
        "mounting {} to {} as {} with data '{}'",
        &m.source, &m.destination, &m.typ, &d
    );

    let src = if m.typ == "bind" {
        let src = canonicalize(&m.source)?;
        let dir = if src.is_file() {
            Path::new(&dest).parent().unwrap()
        } else {
            Path::new(&dest)
        };
        if let Err(e) = create_dir_all(&dir) {
            debug!("ignoring create dir fail of {:?}: {}", &dir, e)
        }
        // make sure file exists so we can bind over it
        if src.is_file() {
            if let Err(e) =
                OpenOptions::new().create(true).write(true).open(&dest)
            {
                debug!("ignoring touch fail of {:?}: {}", &dest, e)
            }
        }
        src
    } else {
        if let Err(e) = create_dir_all(&dest) {
            debug!("ignoring create dir fail of {:?}: {}", &dest, e)
        }
        PathBuf::from(&m.source)
    };

    match mount(Some(&*src), &*dest, Some(&*m.typ), flags, Some(&*d)) {
        Ok(_) => {}
        Err(::nix::Error::Sys(Errno::EINVAL)) => {
            // try again without mount label
            mount(Some(&*src), &*dest, Some(&*m.typ), flags, Some(data))?;
            // warn if label cannot be set
            if let Err(e) = setfilecon(&dest, label) {
                warn!{"could not set mount label of {} to {}: {}",
                &m.destination, &label, e};
            }
        }
        Err(e) => {
            // every other failure, including non-errno nix errors, is fatal
            let chain = || format!("mount of {} failed", &m.destination);
            return Err(e).chain_err(chain);
        }
    }
    // remount bind mounts if they have other flags (like MsFlags::MS_RDONLY)
    if flags.contains(MsFlags::MS_BIND)
        && flags.intersects(
            !(MsFlags::MS_REC
                | MsFlags::MS_REMOUNT
                | MsFlags::MS_BIND
                | MsFlags::MS_PRIVATE
                | MsFlags::MS_SHARED
                | MsFlags::MS_SLAVE),
        ) {
        let chain = || format!("remount of {} failed", &dest);
        mount(
            Some(&*dest),
            &*dest,
            None::<&str>,
            flags | MsFlags::MS_REMOUNT,
            None::<&str>,
        ).chain_err(chain)?;
    }
    Ok(())
}

static SYMLINKS: &'static
/// Packs a `major`/`minor` device pair into a single device number.
///
/// Bit layout of the result, as computed below:
/// * bits 0-7:   the low 8 bits of `minor`
/// * bits 8-19:  the low 12 bits of `major`
/// * the remaining bits of `minor`, shifted left by 12
/// * the remaining bits of `major`, shifted left by 32
///
/// NOTE(review): this looks like the Linux `dev_t` encoding; confirm against
/// the platform headers before relying on it elsewhere.
fn makedev(major: u64, minor: u64) -> u64 {
    let minor_low = minor & 0xff;
    let major_low = (major & 0xfff) << 8;
    let minor_high = (minor & !0xff) << 12;
    let major_high = (major & !0xfff) << 32;
    major_high | minor_high | major_low | minor_low
}
to_sflag(dev.typ)?; 446 | debug!("mknoding {}", &dev.path); 447 | mknod( 448 | &dev.path[1..], 449 | f, 450 | Mode::from_bits_truncate(dev.file_mode.unwrap_or(0)), 451 | makedev(dev.major, dev.minor), 452 | )?; 453 | chown( 454 | &dev.path[1..], 455 | dev.uid.map(|n| Uid::from_raw(n)), 456 | dev.gid.map(|n| Gid::from_raw(n)), 457 | )?; 458 | Ok(()) 459 | } 460 | 461 | fn bind_dev(dev: &LinuxDevice) -> Result<()> { 462 | let fd = open( 463 | &dev.path[1..], 464 | OFlag::O_RDWR | OFlag::O_CREAT, 465 | Mode::from_bits_truncate(0o644), 466 | )?; 467 | close(fd)?; 468 | debug!("bind mounting {}", &dev.path); 469 | mount( 470 | Some(&*dev.path), 471 | &dev.path[1..], 472 | None::<&str>, 473 | MsFlags::MS_BIND, 474 | None::<&str>, 475 | )?; 476 | Ok(()) 477 | } 478 | 479 | fn mask_path(path: &str) -> Result<()> { 480 | if !path.starts_with('/') || path.contains("..") { 481 | let msg = format!("invalid maskedPath: {}", path); 482 | return Err(ErrorKind::InvalidSpec(msg).into()); 483 | } 484 | 485 | if let Err(::nix::Error::Sys(errno)) = mount( 486 | Some("/dev/null"), 487 | path, 488 | None::<&str>, 489 | MsFlags::MS_BIND, 490 | None::<&str>, 491 | ) { 492 | // ignore ENOENT and ENOTDIR: path to mask doesn't exist 493 | if errno != Errno::ENOENT && errno != Errno::ENOTDIR { 494 | let msg = format!("could not mask {}", path); 495 | Err(::nix::Error::Sys(errno)).chain_err(|| msg)?; 496 | } else { 497 | debug!("ignoring mask of {} because it doesn't exist", path); 498 | } 499 | } 500 | Ok(()) 501 | } 502 | 503 | fn readonly_path(path: &str) -> Result<()> { 504 | if !path.starts_with('/') || path.contains("..") { 505 | let msg = format!("invalid readonlyPath: {}", path); 506 | return Err(ErrorKind::InvalidSpec(msg).into()); 507 | } 508 | if let Err(e) = mount( 509 | Some(&path[1..]), 510 | path, 511 | None::<&str>, 512 | MsFlags::MS_BIND | MsFlags::MS_REC, 513 | None::<&str>, 514 | ) { 515 | match e { 516 | ::nix::Error::Sys(errno) => { 517 | // ignore ENOENT: path to make 
read only doesn't exist 518 | if errno != Errno::ENOENT { 519 | let msg = format!("could not readonly {}", path); 520 | Err(e).chain_err(|| msg)?; 521 | } 522 | debug!("ignoring remount of {} because it doesn't exist", path); 523 | return Ok(()); 524 | } 525 | _ => { 526 | unreachable!("Supposedly unreachable error {:?}", e); 527 | } 528 | } 529 | } 530 | mount( 531 | Some(&path[1..]), 532 | &path[1..], 533 | None::<&str>, 534 | MsFlags::MS_BIND 535 | | MsFlags::MS_REC 536 | | MsFlags::MS_RDONLY 537 | | MsFlags::MS_REMOUNT, 538 | None::<&str>, 539 | )?; 540 | Ok(()) 541 | } 542 | -------------------------------------------------------------------------------- /src/nix_ext.rs: -------------------------------------------------------------------------------- 1 | // Functions in libc that haven't made it into nix yet 2 | use libc; 3 | use nix::errno::Errno; 4 | use nix::Result; 5 | use std::ffi::CString; 6 | use std::os::unix::io::RawFd; 7 | 8 | #[inline] 9 | pub fn lsetxattr( 10 | path: &CString, 11 | name: &CString, 12 | value: &CString, 13 | len: usize, 14 | flags: i32, 15 | ) -> Result<()> { 16 | let res = unsafe { 17 | libc::lsetxattr( 18 | path.as_ptr(), 19 | name.as_ptr(), 20 | value.as_ptr() as *const libc::c_void, 21 | len, 22 | flags, 23 | ) 24 | }; 25 | Errno::result(res).map(drop) 26 | } 27 | 28 | #[inline] 29 | pub fn fchdir(fd: RawFd) -> Result<()> { 30 | let res = unsafe { libc::fchdir(fd) }; 31 | Errno::result(res).map(drop) 32 | } 33 | 34 | #[inline] 35 | pub fn setgroups(gids: &[libc::gid_t]) -> Result<()> { 36 | let res = unsafe { libc::setgroups(gids.len(), gids.as_ptr()) }; 37 | Errno::result(res).map(drop) 38 | } 39 | 40 | #[inline] 41 | pub fn setrlimit( 42 | resource: libc::c_int, 43 | soft: libc::c_ulonglong, 44 | hard: libc::c_ulonglong, 45 | ) -> Result<()> { 46 | let rlim = &libc::rlimit { 47 | rlim_cur: soft, 48 | rlim_max: hard, 49 | }; 50 | let res = unsafe { libc::setrlimit(resource, rlim) }; 51 | Errno::result(res).map(drop) 52 | } 53 | 
54 | #[inline] 55 | pub fn clearenv() -> Result<()> { 56 | let res = unsafe { libc::clearenv() }; 57 | Errno::result(res).map(drop) 58 | } 59 | 60 | #[cfg(target_env = "gnu")] 61 | #[inline] 62 | pub fn putenv(string: &CString) -> Result<()> { 63 | // NOTE: gnue takes ownership of the string so we pass it 64 | // with into_raw. 65 | // This prevents the string to be de-allocated. 66 | // According to 67 | // https://www.gnu.org/software/libc/manual/html_node/Environment-Access.html 68 | // the variable will be accessable from the exec'd program 69 | // throughout its lifetime, as such this is not going to be re-claimed 70 | // and will show up as leak in valgrind and friends. 71 | let ptr = string.clone().into_raw(); 72 | let res = unsafe { libc::putenv(ptr as *mut libc::c_char) }; 73 | Errno::result(res).map(drop) 74 | } 75 | 76 | #[cfg(not(target_env = "gnu"))] 77 | pub fn putenv(string: &CString) -> Result<()> { 78 | let res = unsafe { libc::putenv(string.as_ptr() as *mut libc::c_char) }; 79 | Errno::result(res).map(drop) 80 | } 81 | -------------------------------------------------------------------------------- /src/seccomp.rs: -------------------------------------------------------------------------------- 1 | use errors::*; 2 | use oci::{Arch, LinuxSeccomp, LinuxSeccompOperator}; 3 | use seccomp_sys::*; 4 | 5 | fn to_arch(arch: Arch) -> scmp_arch { 6 | unsafe { ::std::mem::transmute(arch) } 7 | } 8 | 9 | fn to_cmp(cmp: LinuxSeccompOperator) -> scmp_compare { 10 | unsafe { ::std::mem::transmute(cmp) } 11 | } 12 | 13 | fn syscall_resolve_name(name: &str) -> ::Result { 14 | let s = ::std::ffi::CString::new(name)?; 15 | let id = unsafe { seccomp_syscall_resolve_name(s.as_ptr()) }; 16 | if id == __NR_SCMP_ERROR { 17 | let msg = format!("could not resolve {}", name); 18 | Err(ErrorKind::SeccompError(msg).into()) 19 | } else { 20 | Ok(id) 21 | } 22 | } 23 | 24 | fn init(act: u32) -> Result<*mut scmp_filter_ctx> { 25 | let filter_ctx = unsafe { seccomp_init(act) }; 
26 | if filter_ctx.is_null() { 27 | let msg = "initialization failed".to_string(); 28 | Err(ErrorKind::SeccompError(msg).into()) 29 | } else { 30 | Ok(filter_ctx) 31 | } 32 | } 33 | 34 | fn arch_add(ctx: *mut scmp_filter_ctx, arch: scmp_arch) -> ::Result { 35 | let id = unsafe { seccomp_arch_add(ctx, arch as u32) }; 36 | if id == __NR_SCMP_ERROR { 37 | let msg = format!("could not add arch {:?}", arch); 38 | Err(ErrorKind::SeccompError(msg).into()) 39 | } else { 40 | Ok(id) 41 | } 42 | } 43 | 44 | fn rule_add( 45 | ctx: *mut scmp_filter_ctx, 46 | act: u32, 47 | id: i32, 48 | cmps: &[scmp_arg_cmp], 49 | ) -> Result<()> { 50 | let res = unsafe { 51 | let ptr = if cmps.is_empty() { 52 | ::std::ptr::null() 53 | } else { 54 | cmps.as_ptr() 55 | }; 56 | seccomp_rule_add_array(ctx, act, id, cmps.len() as u32, ptr) 57 | }; 58 | if res != 0 { 59 | let msg = format!("failed to add rule for {}", id); 60 | Err(ErrorKind::SeccompError(msg).into()) 61 | } else { 62 | Ok(()) 63 | } 64 | } 65 | 66 | fn attr_set( 67 | ctx: *mut scmp_filter_ctx, 68 | attr: scmp_filter_attr, 69 | value: u32, 70 | ) -> Result<()> { 71 | let res = unsafe { seccomp_attr_set(ctx, attr, value) }; 72 | if res != 0 { 73 | let msg = "failed to set_attr".to_string(); 74 | Err(ErrorKind::SeccompError(msg).into()) 75 | } else { 76 | Ok(()) 77 | } 78 | } 79 | 80 | fn load(ctx: *mut scmp_filter_ctx) -> Result<()> { 81 | let res = unsafe { seccomp_load(ctx) }; 82 | if res != 0 { 83 | let msg = "failed to load filter".to_string(); 84 | Err(ErrorKind::SeccompError(msg).into()) 85 | } else { 86 | Ok(()) 87 | } 88 | } 89 | 90 | pub fn initialize_seccomp(seccomp: &LinuxSeccomp) -> ::Result<()> { 91 | let ctx = init(seccomp.default_action as u32)?; 92 | // set control NoNewPrivs to false, as we deal with it separately 93 | attr_set(ctx, scmp_filter_attr::SCMP_FLTATR_CTL_NNP, false as u32)?; 94 | // set up architectures 95 | for arch in &seccomp.architectures { 96 | arch_add(ctx, to_arch(*arch))?; 97 | } 98 | // add 
actions for syscalls 99 | for syscall in &seccomp.syscalls { 100 | let mut names = syscall.names.clone(); 101 | if names.is_empty() { 102 | names.push(syscall.name.clone()) 103 | }; 104 | for name in names { 105 | let id = match syscall_resolve_name(&name) { 106 | Ok(result) => result, 107 | Err(e) => { 108 | info!("Skipping unknown syscall: {}", e); 109 | continue; 110 | } 111 | }; 112 | let mut cmps = Vec::new(); 113 | for arg in &syscall.args { 114 | cmps.push(scmp_arg_cmp { 115 | arg: arg.index as u32, 116 | op: to_cmp(arg.op), 117 | datum_a: arg.value as scmp_datum_t, 118 | datum_b: arg.value_two as scmp_datum_t, 119 | }); 120 | } 121 | 122 | rule_add(ctx, syscall.action as u32, id, &cmps)?; 123 | } 124 | } 125 | load(ctx)?; 126 | Ok(()) 127 | } 128 | -------------------------------------------------------------------------------- /src/selinux.rs: -------------------------------------------------------------------------------- 1 | use errors::*; 2 | use nix::fcntl::{open, OFlag}; 3 | use nix::sys::stat::Mode; 4 | use nix::unistd::{close, write}; 5 | use nix_ext::lsetxattr; 6 | use std::ffi::CString; 7 | 8 | const EXEC_PATH: &'static str = "/proc/self/attr/exec"; 9 | 10 | pub fn setexeccon(label: &str) -> Result<()> { 11 | let fd = open(EXEC_PATH, OFlag::O_RDWR, Mode::empty())?; 12 | defer!(close(fd).unwrap()); 13 | write(fd, label.as_bytes())?; 14 | Ok(()) 15 | } 16 | 17 | const XATTR_NAME: &'static str = "security.selinux"; 18 | 19 | pub fn setfilecon(file: &str, label: &str) -> Result<()> { 20 | let path = CString::new(file)?; 21 | let name = CString::new(XATTR_NAME)?; 22 | let value = CString::new(label)?; 23 | lsetxattr(&path, &name, &value, label.len(), 0)?; 24 | Ok(()) 25 | } 26 | -------------------------------------------------------------------------------- /src/signals.rs: -------------------------------------------------------------------------------- 1 | use errors::*; 2 | use libc::c_int; 3 | use nix::sys::signal::{kill, raise, sigaction}; 4 | use 
nix::sys::signal::{SaFlags, SigAction, SigHandler, SigSet, Signal}; 5 | use nix::unistd::Pid; 6 | 7 | pub fn pass_signals(child_pid: Pid) -> Result<()> { 8 | unsafe { 9 | CHILD_PID = Some(child_pid); 10 | set_handler(SigHandler::Handler(child_handler))?; 11 | } 12 | Ok(()) 13 | } 14 | 15 | // NOTE: signal handlers need to know which child to pass 16 | // a signal to. We store the child's pid in a global variable. 17 | // The child pid is only set once prior to setting up the 18 | // signal handler, so it should be safe to access it from the 19 | // signal handler. 20 | static mut CHILD_PID: Option = None; 21 | 22 | extern "C" fn child_handler(signo: c_int) { 23 | unsafe { 24 | let _ = kill( 25 | CHILD_PID.unwrap_or(Pid::from_raw(0)), 26 | Signal::from_c_int(signo).unwrap(), 27 | ); 28 | } 29 | } 30 | 31 | unsafe fn set_handler(handler: SigHandler) -> Result<()> { 32 | let a = SigAction::new(handler, SaFlags::empty(), SigSet::all()); 33 | sigaction(Signal::SIGTERM, &a).chain_err(|| "failed to sigaction")?; 34 | sigaction(Signal::SIGQUIT, &a).chain_err(|| "failed to sigaction")?; 35 | sigaction(Signal::SIGINT, &a).chain_err(|| "failed to sigaction")?; 36 | sigaction(Signal::SIGHUP, &a).chain_err(|| "failed to sigaction")?; 37 | sigaction(Signal::SIGUSR1, &a).chain_err(|| "failed to sigaction")?; 38 | sigaction(Signal::SIGUSR2, &a).chain_err(|| "failed to sigaction")?; 39 | Ok(()) 40 | } 41 | 42 | pub fn signal_children(signal: Signal) -> Result<()> { 43 | // don't signal this thread 44 | let mut s = SigSet::empty(); 45 | s.add(signal); 46 | s.thread_block()?; 47 | kill(Pid::from_raw(0), signal)?; 48 | Ok(()) 49 | } 50 | 51 | pub fn to_signal(signal: &str) -> Result { 52 | Ok(match signal { 53 | "1" | "HUP" | "SIGHUP" => Signal::SIGHUP, 54 | "2" | "INT" | "SIGINT" => Signal::SIGINT, 55 | "3" | "QUIT" | "SIGQUIT" => Signal::SIGQUIT, 56 | "4" | "ILL" | "SIGILL" => Signal::SIGILL, 57 | "5" | "BUS" | "SIGBUS" => Signal::SIGBUS, 58 | "6" | "ABRT" | "IOT" | "SIGABRT" | 
"SIGIOT" => Signal::SIGABRT, 59 | "7" | "TRAP" | "SIGTRAP" => Signal::SIGTRAP, 60 | "8" | "FPE" | "SIGFPE" => Signal::SIGFPE, 61 | "9" | "KILL" | "SIGKILL" => Signal::SIGKILL, 62 | "10" | "USR1" | "SIGUSR1" => Signal::SIGUSR1, 63 | "11" | "SEGV" | "SIGSEGV" => Signal::SIGSEGV, 64 | "12" | "USR2" | "SIGUSR2" => Signal::SIGUSR2, 65 | "13" | "PIPE" | "SIGPIPE" => Signal::SIGPIPE, 66 | "14" | "ALRM" | "SIGALRM" => Signal::SIGALRM, 67 | "15" | "TERM" | "SIGTERM" => Signal::SIGTERM, 68 | "16" | "STKFLT" | "SIGSTKFLT" => Signal::SIGSTKFLT, 69 | "17" | "CHLD" | "SIGCHLD" => Signal::SIGCHLD, 70 | "18" | "CONT" | "SIGCONT" => Signal::SIGCONT, 71 | "19" | "STOP" | "SIGSTOP" => Signal::SIGSTOP, 72 | "20" | "TSTP" | "SIGTSTP" => Signal::SIGTSTP, 73 | "21" | "TTIN" | "SIGTTIN" => Signal::SIGTTIN, 74 | "22" | "TTOU" | "SIGTTOU" => Signal::SIGTTOU, 75 | "23" | "URG" | "SIGURG" => Signal::SIGURG, 76 | "24" | "XCPU" | "SIGXCPU" => Signal::SIGXCPU, 77 | "25" | "XFSZ" | "SIGXFSZ" => Signal::SIGXFSZ, 78 | "26" | "VTALRM" | "SIGVTALRM" => Signal::SIGVTALRM, 79 | "27" | "PROF" | "SIGPROF" => Signal::SIGPROF, 80 | "28" | "WINCH" | "SIGWINCH" => Signal::SIGWINCH, 81 | "29" | "IO" | "SIGIO" => Signal::SIGIO, 82 | "30" | "PWR" | "SIGPWR" => Signal::SIGPWR, 83 | "31" | "SYS" | "SIGSYS" => Signal::SIGSYS, 84 | _ => bail!{"{} is not a valid signal", signal}, 85 | }) 86 | } 87 | 88 | pub fn signal_process>>( 89 | pid: Pid, 90 | signal: T, 91 | ) -> Result<()> { 92 | kill(pid, signal)?; 93 | Ok(()) 94 | } 95 | 96 | pub fn raise_for_parent(signal: Signal) -> Result<()> { 97 | // reset the sigaction for the signal 98 | if signal != Signal::SIGKILL && signal != Signal::SIGSTOP { 99 | let a = 100 | SigAction::new(SigHandler::SigDfl, SaFlags::empty(), SigSet::all()); 101 | unsafe { 102 | sigaction(signal, &a).chain_err(|| "failed to sigaction")?; 103 | } 104 | } 105 | // make sure the signal is unblocked 106 | let mut s = SigSet::empty(); 107 | s.add(signal); 108 | s.thread_unblock().chain_err(|| 
"failed to unblock signal")?; 109 | // raise the signal 110 | raise(signal).chain_err(|| format!("failed to raise signal {:?}", signal))?; 111 | Ok(()) 112 | } 113 | 114 | pub fn wait_for_signal() -> Result { 115 | let s = SigSet::all(); 116 | s.thread_block()?; 117 | let result = s.wait()?; 118 | s.thread_unblock()?; 119 | Ok(result) 120 | } 121 | -------------------------------------------------------------------------------- /src/sync.rs: -------------------------------------------------------------------------------- 1 | use super::Result; 2 | use nix::fcntl::OFlag; 3 | use nix::unistd::{close, pipe2, read}; 4 | use std::os::unix::io::RawFd; 5 | 6 | pub struct Cond { 7 | rfd: RawFd, 8 | wfd: RawFd, 9 | } 10 | 11 | impl Cond { 12 | pub fn new() -> Result { 13 | let (rfd, wfd) = pipe2(OFlag::O_CLOEXEC)?; 14 | Ok(Cond { rfd: rfd, wfd: wfd }) 15 | } 16 | 17 | pub fn wait(&self) -> Result<()> { 18 | close(self.wfd)?; 19 | let data: &mut [u8] = &mut [0]; 20 | while read(self.rfd, data)? != 0 {} 21 | close(self.rfd)?; 22 | Ok(()) 23 | } 24 | pub fn notify(&self) -> Result<()> { 25 | close(self.rfd)?; 26 | close(self.wfd)?; 27 | Ok(()) 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /wercker.yml: -------------------------------------------------------------------------------- 1 | box: rust:latest 2 | build: 3 | steps: 4 | - install-packages: 5 | packages: libseccomp-dev build-essential 6 | - script: 7 | name: cargo deps 8 | code: cargo install cargo-when 9 | - script: 10 | name: build 11 | code: ./build.sh 12 | 13 | build-nightly: 14 | box: scorpil/rust:nightly 15 | steps: 16 | - install-packages: 17 | packages: libseccomp-dev build-essential 18 | - script: 19 | name: cargo deps 20 | code: cargo install cargo-when 21 | - script: 22 | name: build 23 | code: ./build.sh 24 | 25 | build-musl: 26 | box: 27 | id: ekidd/rust-musl-builder:stable 28 | cmd: /usr/bin/sudo -E /bin/bash 29 | steps: 30 | - script: 31 | name: update 
path 32 | code: export PATH=$PATH:/home/rust/.cargo/bin 33 | - install-packages: 34 | packages: build-essential dh-autoreconf musl-tools git 35 | - script: 36 | name: pull seccomp 37 | code: | 38 | git submodule update --init 39 | - script: 40 | name: cargo deps 41 | code: cargo install cargo-when 42 | - script: 43 | name: build 44 | code: | 45 | TARGET=x86_64-unknown-linux-musl ./build.sh 46 | 47 | build-musl-nightly: 48 | box: 49 | id: ekidd/rust-musl-builder:nightly 50 | cmd: /usr/bin/sudo -E /bin/bash 51 | steps: 52 | - script: 53 | name: update path 54 | code: export PATH=$PATH:/home/rust/.cargo/bin 55 | - install-packages: 56 | packages: build-essential dh-autoreconf musl-tools git 57 | - script: 58 | name: pull seccomp 59 | code: | 60 | git submodule update --init 61 | - script: 62 | name: cargo deps 63 | code: cargo install cargo-when 64 | - script: 65 | name: build 66 | code: | 67 | TARGET=x86_64-unknown-linux-musl ./build.sh 68 | --------------------------------------------------------------------------------