├── website ├── content │ └── specs │ │ └── _index.md ├── archetypes │ └── default.md ├── assets │ ├── _variables.scss │ └── _custom.scss ├── layouts │ └── partials │ │ └── docs │ │ └── links │ │ └── edit.html ├── config.toml └── README.md ├── .gitignore ├── specs ├── sysext.md ├── discoverable_disk_image.md ├── configuration_files_specification.md ├── vmgenid.md ├── version_format_specification.md ├── elf_dlopen_metadata.md ├── package_metadata_for_executable_files.md ├── extension_image.md ├── linux_tpm_pcr_registry.md ├── unified_kernel_image.md ├── osc_context.md ├── vmclock.md ├── linux_file_system_hierarchy.md └── discoverable_partitions_specification.md ├── .gitmodules ├── .github └── workflows │ └── gh-pages.yml └── README.md /website/content/specs/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Specifications" 3 | --- 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | website/public 2 | website/.hugo_build.lock 3 | website/resources/_gen 4 | public/ 5 | -------------------------------------------------------------------------------- /specs/sysext.md: -------------------------------------------------------------------------------- 1 | --- 2 | bookHidden: true 3 | --- 4 | This content has moved to [Extension Images](extension_image.md) 5 | -------------------------------------------------------------------------------- /website/archetypes/default.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "{{ replace .Name "-" " " | title }}" 3 | date: {{ .Date }} 4 | draft: true 5 | --- 6 | 7 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "website/themes/hugo-book"] 2 | path = 
website/themes/hugo-book 3 | url = https://github.com/alex-shpak/hugo-book 4 | -------------------------------------------------------------------------------- /website/assets/_variables.scss: -------------------------------------------------------------------------------- 1 | $body-min-width: 20rem; 2 | $container-max-width: 110rem; 3 | 4 | $menu-width: 15rem; 5 | $toc-width: 15rem; 6 | -------------------------------------------------------------------------------- /website/assets/_custom.scss: -------------------------------------------------------------------------------- 1 | .book-menu nav>ul:first-of-type>li:last-child::after { 2 | content: "⸻"; 3 | display: inline-block; 4 | margin: 0.5em 0; 5 | } 6 | 7 | .book-menu nav>ul:last-of-type>li:last-child::before { 8 | content: "⸻"; 9 | display: inline-block; 10 | margin: 0.5em 0; 11 | } 12 | -------------------------------------------------------------------------------- /website/layouts/partials/docs/links/edit.html: -------------------------------------------------------------------------------- 1 | {{- return (partial "docs/text/template" (dict "Template" .Site.Params.BookEditLink "Context" (dict 2 | "Site" .Site 3 | "Page" .Page 4 | "Path" (strings.TrimPrefix hugo.WorkingDir ( replace .Page.File.Path "_index.md" .Site.Params.BookIndexPage)) 5 | )) | urls.JoinPath) -}} 6 | -------------------------------------------------------------------------------- /.github/workflows/gh-pages.yml: -------------------------------------------------------------------------------- 1 | name: GitHub Pages 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | 9 | jobs: 10 | deploy: 11 | if: github.repository == 'uapi-group/specifications' 12 | runs-on: ubuntu-22.04 13 | concurrency: 14 | group: ${{ github.workflow }}-${{ github.ref }} 15 | steps: 16 | - uses: actions/checkout@v3 17 | with: 18 | submodules: true 19 | fetch-depth: 0 20 | 21 | - name: Setup Hugo 22 | uses: peaceiris/actions-hugo@v2 23 | with: 24 | 
hugo-version: '0.147.8' 25 | extended: true 26 | 27 | - name: Build 28 | run: cd website && hugo --minify -d ../public 29 | 30 | - name: Deploy 31 | uses: peaceiris/actions-gh-pages@v3 32 | if: ${{ github.ref == 'refs/heads/main' }} 33 | with: 34 | github_token: ${{ secrets.GITHUB_TOKEN }} 35 | publish_dir: ./public 36 | -------------------------------------------------------------------------------- /website/config.toml: -------------------------------------------------------------------------------- 1 | baseURL = "https://uapi-group.org/specifications" 2 | languageCode = "en-us" 3 | title = "UAPI Group Specifications" 4 | theme = "hugo-book" 5 | copyright = 'Licensed under [CC-BY-4.0](https://spdx.org/licenses/CC-BY-4.0.html)' 6 | 7 | [[menu.before]] 8 | name = "⬅️ Back to top" 9 | url = "/.." 10 | weight = 1 11 | 12 | [[menu.after]] 13 | name = "Collaborate on Github" 14 | url = "https://github.com/uapi-group/specifications" 15 | weight = 11 16 | 17 | [markup.goldmark.renderer] 18 | unsafe = true # Allow HTML in md files 19 | 20 | [params] 21 | BookPortableLinks = true 22 | BookSection = '*' 23 | BookRepo = 'https://github.com/uapi-group/specifications' 24 | BookCommitPath = 'commit' 25 | BookEditLink = '{{ .Site.Params.BookRepo }}/edit/main/{{ .Path }}' 26 | BookDateFormat = 'Jan 2, 2006' 27 | BookIndexPage = 'README.md' 28 | BookTheme = 'auto' 29 | 30 | [modules] 31 | [[module.mounts]] 32 | source = 'content' 33 | target = 'content' 34 | [[module.mounts]] 35 | source = '../README.md' 36 | target = 'content/_index.md' 37 | [[module.mounts]] 38 | source = '../specs' 39 | target = 'content/specs' 40 | -------------------------------------------------------------------------------- /website/README.md: -------------------------------------------------------------------------------- 1 | # Static website generation for UAPI group specifications 2 | 3 | This repository uses Hugo for static HTML generation. 
4 | See https://gohugo.io/getting-started/quick-start/ for a brief intro. 5 | 6 | The website uses the [hugo-book](https://github.com/alex-shpak/hugo-book) theme; it is included in this repo as a git submodule. 7 | After cloning this repo please run `git submodule init; git submodule update`. 8 | If you check out a branch or tag, make sure the submodule is up to date by running `git submodule update`. 9 | 10 | ## Website repo layout 11 | 12 | Content resides in the [content](content/) folder. 13 | Top-level intro pages and menu entries should be put there. 14 | The repository's [specs](../specs) folder is mounted there (as `content/specs`) via the module mounts in `config.toml`. 15 | New specs are automatically added to the navigation menu on the left. 16 | Optionally, the [index](content/_index.md) page may be updated to reference / feature important specs. 17 | 18 | ## Making changes and testing 19 | 20 | You'll need [hugo installed](https://gohugo.io/getting-started/installing/) for rendering changes. 21 | 22 | First, make your edits. 23 | Then, start hugo locally (in the repo's `website` directory) to review your changes: 24 | 25 | ```shell 26 | $ hugo server --minify --disableFastRender 27 | ``` 28 | 29 | Review your changes at http://localhost:1313/specifications/ . 
30 | -------------------------------------------------------------------------------- /specs/discoverable_disk_image.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: UAPI.3 Discoverable Disk Images 3 | category: Concepts 4 | layout: default 5 | version: 1.0 6 | SPDX-License-Identifier: CC-BY-4.0 7 | weight: 3 8 | aliases: 9 | - /UAPI.3 10 | - /3 11 | --- 12 | # UAPI.3 Discoverable Disk Images (DDI) 13 | 14 | | Version | Changes | 15 | |---------|-----------------| 16 | | 1.0 | Initial Release | 17 | 18 | DDIs (Discoverable Disk Images) are self-describing file system images that follow the DPS ([Discoverable 19 | Partitions Specification](discoverable_partitions_specification.md)), wrapped in a GPT partition table, that 20 | may contain root (or `/usr/`) filesystems for bootable OS images, system extensions, configuration 21 | extensions, portable services, containers and more, and shall be protected by signed `dm-verity` all combined 22 | into one. They are designed to be composable and stackable, and provide security by default. 23 | 24 | ## Image Format 25 | The images use the GPT partition table verbatim, so it will not be redefined here. Each partition contains 26 | a standard Linux filesystem (e.g.: `erofs`), so again this will not be redefined here. 27 | The [DPS](discoverable_partitions_specification.md) defines the GUIDs to use and the format of the 28 | `dm-verity` signature partition's JSON content. 29 | 30 | It is recommended to use a sector size of 512 bytes or 4096 for DDIs. Software operating with DDIs should 31 | automatically derive the sector size used for a DDI by looking for the `EFI PART` magic string at offsets 512 32 | or 4096, as per GPT specification. 33 | 34 | ## Naming 35 | 36 | DDIs should use `.raw` as file suffix. A secondary suffix may be used to clarify the specific usage class of 37 | a DDI. 
For now the two secondary suffixes `.sysext.raw` and `.confext.raw` are defined (for system extension 38 | DDIs and configuration extension DDIs, see [Extension 39 | Images](https://uapi-group.org/specifications/specs/extension_image) for details). 40 | 41 | The MIME type for DDIs is `application/vnd.efi.img`, [as per 42 | IANA](https://www.iana.org/assignments/media-types/application/vnd.efi.img). 43 | 44 | ## Image Version 45 | If the DDI is versioned, the version format described in the 46 | [Version Format Specification](version_format_specification.md) must be used. The underscore character (`_`) 47 | must be used to separate the version from the name of the image. For example: `foo_1.2.raw` denotes a `foo` 48 | DDI with version `1.2`. 49 | -------------------------------------------------------------------------------- /specs/configuration_files_specification.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: UAPI.6 Configuration Files Specification 3 | layout: posts 4 | version: 1.0 5 | weight: 6 6 | aliases: 7 | - /UAPI.6 8 | - /6 9 | --- 10 | 11 | # UAPI.6 Configuration Files Specification 12 | 13 | | Version | Changes | 14 | |---------|-----------------| 15 | | 1.0 | Initial Release | 16 | 17 | ## Introduction 18 | 19 | Various specifications attempt to define configuration files and file formats. This 20 | specification establishes where these files should be looked for, in which 21 | order, and how precedence, masking, extensions and overrides work. 22 | 23 | The purpose of the rules defined here is to allow OS vendors to implement the 24 | hermetic-usr pattern, where all vendor files are shipped in the vendor tree itself 25 | (`/usr/`), including configuration files with system defaults, while allowing local 26 | or vendor overrides without modifying the original files, for easier management. This is 27 | especially beneficial for image-based deployments, where the vendor tree is read-only. 
28 | 29 | These rules are derived from existing real-world usage from the [systemd 30 | project](https://github.com/systemd/systemd) and the [libeconf 31 | project](https://github.com/openSUSE/libeconf). It is highly recommended to use libeconf, 32 | which is readily available and maintained in all major distributions, rather than 33 | reimplementing this specification locally. 34 | 35 | This specification is agnostic toward the actual format and content of the configuration 36 | files, the precise path used under each top-level hierarchy, and also toward the 37 | filenames and extensions, with the exception of the `.d/` notation for drop-ins 38 | directories. It is strongly encouraged to enforce a specific suffix for the configuration 39 | files, in order to disambiguate (e.g.: with backup files), but the choice of which suffix 40 | to use is left to each implementation. 41 | 42 | ## Storage Directories and Overrides 43 | In order to allow shipping system defaults owned by the OS vendor, while at the same 44 | time letting local users or admins override those defaults, `/usr/` and `/etc/` are both 45 | supported for storage of configuration files, with the latter having higher priority. 46 | The precise location under `/usr/` is left open for the implementation to decide - it 47 | could be hard-coded to `/usr/lib/` or it could be left to each application to pick from 48 | various options, such as `/usr/share/` or `/usr/etc/`. 49 | Programs must work correctly if no configuration files are found in `/etc/`. 50 | Optionally, `/run/` is also supported for ephemeral overrides. 51 | 52 | For example, `/usr/lib/foo/bar.conf` provides the default configuration file. 53 | If `/run/foo/bar.conf` is present and supported, it would take precedence over 54 | `/usr/lib/foo/bar.conf`. 55 | Finally, a user can create `/etc/foo/bar.conf` which would take precedence and 56 | completely override both. 
57 | 58 | ## Masking 59 | As a special override case, it must be possible to mask files across different 60 | locations by creating a symlink to `/dev/null` or an empty file. 61 | 62 | For example, an empty `/etc/foo/bar.conf` means that `/usr/lib/foo/bar.conf` is 63 | masked and thus not parsed. 64 | 65 | ## Drop-ins 66 | All configuration paths must support drop-ins, 67 | except for configuration file formats where automatic combining of multiple files is not feasible, 68 | for example scripts or structured documents. 69 | Supporting drop-ins means that in addition to parsing a full configuration file, 70 | an implementation also parses the drop-in files in the drop-in directories associated with it. 71 | 72 | Drop-ins always have higher precedence than the configuration file they refer to. 73 | Drop-ins are sorted in the lexicographic order using the file name without the path, 74 | regardless of the hierarchy under which they are stored. 75 | The drop-ins that are later in this order have higher precedence. 76 | 77 | Considering the following files are present on the filesystem, this would be the order in which the 78 | files are parsed. Note, that files with the same name override each other. The configuration in 79 | `bar.conf` has the lowest priority, and is read before `a.conf` and `b.conf`. `b.conf` has the 80 | highest priority: 81 | 82 | - ~~`/usr/lib/foo/bar.conf`~~ (overridden by `/etc/foo/bar.conf`) 83 | - `/etc/foo/bar.conf` 84 | - ~~`/usr/lib/foo/bar.conf.d/a.conf`~~ (overridden by `/etc/foo/bar.conf.d/a.conf`) 85 | - `/etc/foo/bar.conf.d/a.conf` 86 | - `/usr/lib/foo/bar.conf.d/b.conf` 87 | 88 | If a config file is masked, drop-ins must still be parsed, unless they are masked 89 | themselves. 
90 | 91 | For example, even if `/usr/lib/foo/bar.conf` is masked by an empty `/etc/foo/bar.conf`, 92 | `/usr/lib/foo/bar.conf.d/a.conf` must still be parsed and applied, unless there is also 93 | an empty `/etc/foo/bar.conf.d/a.conf`, in which case the drop-in is masked too. 94 | 95 | Drop-ins are not recursive, so a drop-in cannot have a directory of drop-ins. 96 | 97 | For example, `/etc/foo/bar.conf.d/a.conf` cannot be overridden by 98 | `/etc/foo/bar.conf.d/a.conf.d/b.conf`, and the latter must be ignored if it exists. 99 | 100 | ### Drop-ins without Main Configuration File 101 | Optionally, schemes with only drop-ins, without a 'main' configuration file, should also 102 | be supported by implementations. In such schemes many drop-ins are loaded from a common 103 | directory in each hierarchy. 104 | 105 | For example, `/usr/lib/foo.d/a.conf`, `/usr/lib/foo.d/b.conf` and `/etc/foo.d/c.conf` 106 | are all loaded and parsed in this scheme, in this order. 107 | -------------------------------------------------------------------------------- /specs/vmgenid.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: UAPI.14 VMGenID 3 | category: Concepts 4 | layout: default 5 | version: 1.0 6 | SPDX-License-Identifier: CC-BY-4.0 7 | weight: 14 8 | aliases: 9 | - /UAPI.14 10 | - /14 11 | --- 12 | 13 | # UAPI.14 VMGenID: Virtual Machine Generation ID 14 | 15 | | Version | Changes | 16 | |---------|-----------------| 17 | | 1.0 | Initial Release | 18 | 19 | Virtual machine operations that restore a VM to an earlier point in time (such as applying snapshots, restoring from backup, cloning, or failover scenarios) can cause serious problems for applications that depend on unique identifiers or cryptographic entropy. The Virtual Machine Generation ID (VMGenID) device provides a mechanism for guest software to detect when such operations have occurred. 
20 | 21 | The VMGenID is a 128-bit cryptographically random identifier that changes whenever a virtual machine is cloned or restored to an earlier state. This allows applications to detect such events and take appropriate protective measures, such as reseeding random number generators, regenerating unique identifiers, or invalidating cached state. 22 | 23 | ## The vmgenid_abi Structure 24 | 25 | The hypervisor provides 16 bytes in shared memory containing the generation ID. The structure can be represented as two little-endian 64-bit values, and must be placed in an 8-byte aligned buffer. 26 | 27 | ### Structure Fields 28 | 29 | | Offset | Field | Description | 30 | |--------|-------------------------------|--------------------------------------------| 31 | | 0x00 | `uint64_t generation_id_low` | Lower 64 bits of the 128-bit generation ID | 32 | | 0x08 | `uint64_t generation_id_high` | Upper 64 bits of the 128-bit generation ID | 33 | 34 | The generation ID is a 128-bit cryptographically random value that is unique across all VM instances and time. All 128 bits are random; it is *not* a Version 4 UUID. 35 | 36 | The generation ID changes whenever the VM is restored to an earlier or non-unique state: 37 | 38 | - Snapshot restoration 39 | - Backup recovery 40 | - VM cloning/copying/import 41 | - Disaster recovery failover 42 | 43 | The generation ID remains constant during normal VM operations: 44 | 45 | - Pause/resume 46 | - Shutdown/restart/reboot 47 | - Host reboot or upgrade 48 | - Live migration or lossless online failover 49 | 50 | Events, such as live migrations, which merely disrupt the VM's clock without changing the uniqueness of its identity do not result in a change to the generation ID. Conversely, cloning (forking) a running VM running on the same host would result in a new generation ID without disrupting the timekeeping. Guests which want to detect clock disruption should use the [VMClock device](vmclock.md) for that purpose. 
51 | 52 | ### GUID interoperability 53 | 54 | If the generation ID is represented as a GUID for the purpose of storage or configuration by a Virtual Machine Monitor, it is recommended that: 55 | 56 | - The generation ID shared to the guest is the little-endian representation of that GUID 57 | - The textual representation of the GUID, in display or configuration, is the RFC 4122 standard big-endian form 58 | 59 | 60 | ## Discovery via ACPI 61 | 62 | To expose VMGenID to the operating system via ACPI, the firmware or hypervisor must: 63 | 64 | 1. Place the shared `vmgenid_abi` structure somewhere in RAM, ROM or device memory space, which is guaranteed not to be used by the operating system. It must not be in ranges reported as `AddressRangeMemory` or `AddressRangeACPI`, and must not be in the same page as any memory which is expected to be mapped by a page table entry with caching disabled. 65 | 66 | 2. Expose a device somewhere in the ACPI namespace with: 67 | - a hardware ID (`_HID`) that is hypervisor-specific 68 | - a DOS Device Name ID (`_DDN`) of "VM_Gen_Counter" 69 | - a compatible ID (`_CID`) of "VM_Gen_Counter" 70 | 71 | 3. Attach to the device an `ADDR` method which when evaluated returns the 64-bit physical address of the generation ID structure as a package containing the low and high 32-bit address components in that order. 72 | 73 | 4. After the generation ID changes, the device shall raise an ACPI Notify operation using notification code 0x80. The device may raise the notify operation even if the generation ID has not changed. 74 | 75 | ## Discovery via Device Tree 76 | 77 | The firmware or hypervisor must place the `vmgenid_abi` structure in an otherwise unused region of physical memory and advertise its presence to the operating system. 
The Device Tree binding for the `microsoft,vmgenid` node is as follows: 78 | 79 | ```yaml 80 | # SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) 81 | %YAML 1.2 82 | --- 83 | $id: http://devicetree.org/schemas/rng/microsoft,vmgenid.yaml# 84 | $schema: http://devicetree.org/meta-schemas/core.yaml# 85 | 86 | title: Virtual Machine Generation ID 87 | 88 | maintainers: 89 | - Jason A. Donenfeld <Jason@zx2c4.com> 90 | 91 | description: 92 | Firmwares or hypervisors can use this devicetree to describe an 93 | interrupt and a shared resource to inject a Virtual Machine Generation ID. 94 | Virtual Machine Generation ID is a globally unique identifier (GUID) and 95 | the devicetree binding follows VMGenID specification. 96 | 97 | properties: 98 | compatible: 99 | const: microsoft,vmgenid 100 | 101 | reg: 102 | description: 103 | Specifies a 16-byte VMGenID in endianness-agnostic hexadecimal format. 104 | maxItems: 1 105 | 106 | interrupts: 107 | description: 108 | Interrupt used to notify that a new VMGenID is available. 109 | maxItems: 1 110 | 111 | required: 112 | - compatible 113 | - reg 114 | - interrupts 115 | 116 | additionalProperties: false 117 | 118 | examples: 119 | - | 120 | #include <dt-bindings/interrupt-controller/arm-gic.h> 121 | rng@80000000 { 122 | compatible = "microsoft,vmgenid"; 123 | reg = <0x80000000 0x1000>; 124 | interrupts = <GIC_SPI 35 IRQ_TYPE_EDGE_RISING>; 125 | }; 126 | ``` 127 | -------------------------------------------------------------------------------- /specs/version_format_specification.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: UAPI.10 Version Format Specification 3 | layout: default 4 | version: 1.0 5 | SPDX-License-Identifier: CC-BY-4.0 6 | weight: 10 7 | aliases: 8 | - /UAPI.10 9 | - /10 10 | --- 11 | 12 | # UAPI.10 Version Format Specification 13 | 14 | | Version | Changes | 15 | |---------|-----------------| 16 | | 1.0 | Initial Release | 17 | 18 | This specification defines the format of version strings and their ordering. 
19 | Various documents that are part of the [uapi-group specification suite](https://uapi-group.org/specifications), 20 | as well as other projects like [systemd](https://systemd.io), 21 | rely on this specification to have 22 | a sort order for strings that include version components, 23 | and use it for various purposes, 24 | such as choosing the default boot entry in the [Boot Loader Specification](boot_loader_specification.md). 25 | 26 | ## Version Format 27 | The version string is a sequence of zero or more characters. 28 | 29 | The following characters have special meaning: 30 | - ASCII digits (`0-9`) form numerical components. 31 | - ASCII letters (`a-z`, `A-Z`) form alphabetical components. 32 | - Dot (`.`) separates parts of a component. 33 | - Minus (`-`) separates major parts of the version string. 34 | - Tilde (`~`) starts a suffix that always sorts lower. 35 | - Caret (`^`) starts a suffix that always sorts higher. 36 | 37 | Other characters are treated as separators. 38 | This includes plus (`+`) and underscore (`_`) and other printable or non-printable characters. 39 | The underscore MAY be used. 40 | The plus SHOULD NOT be used, to avoid confusion with SEMVER which attaches a special meaning to it. 41 | Other characters MUST NOT be used in a version string. 42 | 43 | Note that in some contexts (for example [the DDI specification](discoverable_disk_image.md) and DEB 44 | package file names), the underscore is used as a separator and cannot be used freely in the version 45 | string. 46 | 47 | ## Version Comparison 48 | 49 | The following method should be used to compare version strings. The algorithm 50 | is based on rpm's `rpmvercmp()`, but not identical. 51 | 52 | Both strings are compared from the beginning until the end, or until the 53 | strings are found to compare as different. In a loop: 54 | 1. Any characters which are outside of the set of listed above (`a-z`, `A-Z`, `0-9`, `-`, `.`, `~`, `^`) 55 | are skipped in both strings. 
In particular, this means that non-ASCII characters 56 | that are Unicode digits or letters are skipped too. 57 | 2. If the remaining part of one of strings starts with `~`: 58 | if other remaining part does not start with `~`, 59 | the string with `~` compares lower. Otherwise, both tilde characters are skipped. 60 | 3. If one of the strings has ended: if the other string hasn't, the string that 61 | has remaining characters compares higher. Otherwise, the strings compare 62 | equal. 63 | 4. If the remaining part of one of strings starts with `-`: 64 | if the other remaining part does not start with `-`, 65 | the string with `-` compares lower. Otherwise, both minus characters are skipped. 66 | 5. If the remaining part of one of strings starts with `^`: 67 | if the other remaining part does not start with `^`, 68 | the string with `^` compares lower. Otherwise, both caret characters are skipped. 69 | 6. If the remaining part of one of strings starts with `.`: 70 | if the other remaining part does not start with `.`, 71 | the string with `.` compares lower. Otherwise, both dot characters are skipped. 72 | 7. If either of the remaining parts starts with a digit: numerical prefixes are 73 | compared numerically. Any leading zeroes are skipped. 74 | The numerical prefixes (until the first non-digit character) are evaluated as numbers. 75 | If one of the prefixes is empty, it evaluates as 0. 76 | If the numbers are different, the string with the bigger number compares higher. 77 | Otherwise, the comparison continues at the following characters at point 1. 78 | 8. Leading alphabetical prefixes are compared alphabetically. 79 | The substrings are compared letter-by-letter. 80 | If both letters are the same, the comparison continues with the next letter. 81 | All capital letters compare lower than lower-case letters (`B < a`). 
82 | When the end of one substring has been reached (a non-letter character or the end 83 | of the whole string), if the other substring has remaining letters, it compares higher. 84 | Otherwise, the comparison continues at the following characters at point 1. 85 | 86 | ## Comparison with Other Specifications 87 | Other specifications exist to mandate version formats: 88 | 89 | - [RPM Packaging Guidelines](https://docs.fedoraproject.org/en-US/packaging-guidelines/Versioning/) 90 | - [Debian Policy](https://www.debian.org/doc/debian-policy/ch-controlfields.html#version) 91 | - [Semantic Versioning](https://semver.org/) 92 | 93 | All of these, including the present document, share some commonalities but are also 94 | incompatible in some ways, as they all evolved in different environments. The main 95 | differences are as follows. 96 | 97 | - to separate components DEB uses `_`, RPM uses `-` with positional logic (it assumes different meaning in different positions), and SemVer does not specify anything as it is concerned only with the version part of the string 98 | - to identify a pre-release suffix RPM and DEB use `~` and SemVer uses `-` 99 | - to identify a rebuild suffix DEB uses `+`, SemVer uses `.`, and RPM increases the `release` part of the version 100 | - to identify an epoch prefix DEB and RPM use `:`, and SemVer does not specify anything 101 | 102 | ## Examples 103 | Examples (with '' meaning the empty string): 104 | 105 | * `11 == 11` 106 | * `systemd-123 == systemd-123` 107 | * `bar-123 < foo-123` 108 | * `123a > 123` 109 | * `123.a > 123` 110 | * `123.a < 123.b` 111 | * `123a > 123.a` 112 | * `11α == 11β` 113 | * `B < a` 114 | * '' < `0` 115 | * `0.` > `0` 116 | * `0.0` > `0` 117 | * `0` > `~` 118 | * '' > `~` 119 | * `1_` == `1` 120 | * `_1` == `1` 121 | * `1_` < `1.2` 122 | * `1_2_3` > `1.3.3` 123 | * `1+` == `1` 124 | * `+1` == `1` 125 | * `1+` < `1.2` 126 | * `1+2+3` > `1.3.3` 127 | 128 | Note how in the `1_2_3` > `1.3.3` and `1+2+3` > `1.3.3` 
cases, the underscore and plus characters act as 129 | separators between components, so we first compare `1` with `1.3.3` as numerical version strings, and 130 | `1` < `1.3.3`. The remainder of the first string is not used in the comparison. 131 | 132 | * `122.1` < `123~rc1-1` < `123` < `123-a` < `123-a.1` < `123-1` < `123-1.1` < `123^post1` < `123.a-1` < `123.1-1` < `123a-1` < `124-1` 133 | 134 | In the above example each entry compares smaller than every entry to its right and equal only to itself, 135 | conversely each entry compares larger to every entry to its left and compares unequal to all except itself. 136 | 137 | ## Notes 138 | [systemd-analyze](https://www.freedesktop.org/software/systemd/man/systemd-analyze.html) 139 | implements this version comparison algorithm as 140 | ``` 141 | systemd-analyze compare-versions 142 | ``` 143 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: UAPI Group Specifications 3 | BookToC: false 4 | --- 5 | 6 | # UAPI Group Specifications 7 | 8 | The following specifications have been accepted by the UAPI group: 9 | 10 | * [UAPI.1 Boot Loader Specification](specs/boot_loader_specification.md): 11 | Defines a set of file formats and naming conventions to allow distribution independent boot loader menus supportable by multiple bootloaders. 12 | ([canonical online location](https://uapi-group.org/specifications/specs/boot_loader_specification/)) 13 | * [UAPI.2 Discoverable Partitions Specification](specs/discoverable_partitions_specification.md): 14 | Discusses GUID UUIDs for auto-discovery of partition semantics and mount points. 
15 | ([canonical online location](https://uapi-group.org/specifications/specs/discoverable_partitions_specification/)) 16 | * [UAPI.3 Discoverable Disk Images](specs/discoverable_disk_image.md): 17 | Describes the Discoverable Disk Image format for self-describing system images. 18 | ([canonical online location](https://uapi-group.org/specifications/specs/discoverable_disk_image/)) 19 | * [UAPI.4 Extension Images](specs/extension_image.md): 20 | Describes the use of Discoverable Disk Images to create extensions to a base image. 21 | ([canonical online location](https://uapi-group.org/specifications/specs/extension_image/)) 22 | * [UAPI.5 Unified Kernel Images](specs/unified_kernel_image.md): 23 | Describes the use of UEFI PE binaries to provide a Unified Kernel Image containing the kernel, initrd, command line, and other components. 24 | ([canonical online location](https://uapi-group.org/specifications/specs/unified_kernel_image/)) 25 | * [UAPI.6 Configuration Files Specification](specs/configuration_files_specification.md): 26 | Standardises default locations and environment variables for locating common files or base directories. 27 | This is derived from, and extends, the [XDG Base Directory Specification](https://specifications.freedesktop.org/basedir/latest/), 28 | to allow for separation between vendor and admin configuration files, drop-in files, and masking. 29 | ([canonical online location](https://uapi-group.org/specifications/specs/configuration_files_specification/)) 30 | * [UAPI.7 Linux TPM PCR Registry](specs/linux_tpm_pcr_registry.md): 31 | An informative list of how TPM PCRs are used on a Linux system. 32 | ([canonical online location](https://uapi-group.org/specifications/specs/linux_tpm_pcr_registry/)) 33 | * [UAPI.8 Package Metadata for Executable Files](specs/package_metadata_for_executable_files.md): 34 | Describes the format and mechanism to include packaging metadata in ELF/PE binaries. 
35 | ([canonical online location](https://uapi-group.org/specifications/specs/package_metadata_for_executable_files/)) 36 | * [UAPI.9 Linux File System Hierarchy](specs/linux_file_system_hierarchy.md): 37 | Describes the layout of directories and files in an installation of Linux 38 | ([canonical online location](https://uapi-group.org/specifications/specs/linux_file_system_hierarchy/)) 39 | * [UAPI.10 Version Format Specification](specs/version_format_specification.md): 40 | Defines semantics of version strings used in the other specifications listed here. 41 | ([canonical online location](https://uapi-group.org/specifications/specs/version_format_specification/)) 42 | * [UAPI.11 File Hierarchy for the Verification of OS Artifacts (VOA)](specs/file_hierarchy_for_the_verification_of_os_artifacts.md): 43 | Describes the use of Discoverable Disk Images to create extensions to a base image. 44 | ([canonical online location](https://uapi-group.org/specifications/specs/file_hierarchy_for_the_verification_of_os_artifacts/)) 45 | * [UAPI.12 dlopen() Metadata for ELF Files](specs/elf_dlopen_metadata.md): 46 | Describes the format and mechanism to include dynamically loaded libraries metadata in ELF binaries. 47 | ([canonical online location](https://uapi-group.org/specifications/specs/elf_dlopen_metadata/)) 48 | * [UAPI.13 Efficient Time Synchronisation for Virtual Machines](specs/vmclock.md): 49 | Describes the format and mechanism to synchronize the guest clock. 50 | ([canonical online location](https://uapi-group.org/specifications/specs/vmclock/)) 51 | * [UAPI.14 Virtual Machine Generation ID](specs/vmgenid.md): 52 | Describes the mechanism for detecting virtual machine rollback events. 53 | ([canonical online location](https://uapi-group.org/specifications/specs/vmgenid/)) 54 | * [UAPI.15 OSC 3008: Hierarchical Context Signalling](specs/osc_context.md): 55 | Defines a mechanism for terminal emulators to follow the context hierarchy of what's on screen. 
56 | ([canonical online location](https://uapi-group.org/specifications/specs/osc_context/)) 57 | 58 | ## Work in Progress 59 | 60 | See [open PRs on GitHub](https://github.com/uapi-group/specifications/pulls?q=is%3Apr+is%3Aopen+sort%3Aupdated-desc). 61 | 62 | ## License 63 | 64 | All specifications are licensed under [CC-BY-4.0](https://spdx.org/licenses/CC-BY-4.0.html). 65 | 66 | ## Versioning 67 | 68 | All specifications are versioned. 69 | 70 | The versioning format is MAJOR.MINOR. 71 | Compatible changes increment the MINOR version. 72 | Incompatible changes increment the MAJOR version and reset the MINOR version to `0`. 73 | 74 | Work in progress specifications have a MAJOR version of `0`. 75 | 76 | A `filename/MAJOR.MINOR` git tag will be created when a new version of a given spec is released. 77 | 78 | ## Glossary 79 | 80 | This section clarifies terms and abbreviations used in specs and other documents. 81 | 82 | ## General terms and abbreviations 83 | - *ELF* – Executable and Linkable Format (Linux executable binary format) 84 | - *MOK* – Machine Owner Key (shim) 85 | - *PCR* – TPM Platform Configuration Registers 86 | - *PE* – Portable Executable (UEFI executable binary format) 87 | - *SBAT* – UEFI Secure Boot Advanced Targeting 88 | - *TPM* – Trusted Platform Module (security chip) 89 | 90 | ## Terms and abbreviations specific to UAPI group specifications 91 | - [*BLS*](specs/boot_loader_specification.md) – Boot Loader Specification 92 | - [*confext*](specs/extension_image.md) – Configuration Extension Image 93 | (type of DDI that is overlaid on top of `/etc/` via overlayfs and can extend the underlying OS' configuration in a composable, immutable fashion) 94 | - [*DDI*](specs/discoverable_disk_image.md) – Discoverable Disk Image 95 | - [*DPS*](specs/discoverable_partitions_specification.md) – Discoverable Partitions Specification 96 | - [*sysext*](specs/extension_image.md) – System Extension Image 97 | (type of DDI that is overlaid on top of `/usr/`
and `/opt/` via overlayfs and can extend the underlying OS vendor resources in a composable, immutable fashion) 98 | - [*UKI*](specs/unified_kernel_image.md) – Unified Kernel Images (UEFI boot stub + kernel + initrd + more) 99 | - [*VMClock*](specs/vmclock.md) – Virtual Machine Clock (efficient time synchronisation for virtual machines) 100 | - [*VMGenID*](specs/vmgenid.md) – Virtual Machine Generation ID (mechanism for detecting VM rollback events) 101 | - [*VOA*](specs/file_hierarchy_for_the_verification_of_os_artifacts.md) – Verification of OS Artifacts 102 | 103 | ## Participate 104 | 105 | Please use the [specifications issue tracker](https://github.com/uapi-group/specifications/issues) to engage with the project. 106 | -------------------------------------------------------------------------------- /specs/elf_dlopen_metadata.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: UAPI.12 dlopen() Metadata for ELF Files 3 | category: Interfaces 4 | layout: default 5 | version: 1.0 6 | SPDX-License-Identifier: CC-BY-4.0 7 | weight: 12 8 | aliases: 9 | - /UAPI.12 10 | - /12 11 | --- 12 | 13 | # UAPI.12 `dlopen()` Metadata for ELF Files 14 | 15 | | Version | Changes | 16 | |---------|-----------------| 17 | | 1.0 | Initial Release | 18 | 19 | ## Target Audience 20 | 21 | The target audience for this specification is: 22 | 23 | * Developers working on userspace subsystems that create ELF binaries that dynamically load libraries 24 | * Developers working on userspace subsystems that package ELF binaries that dynamically load libraries 25 | 26 | ## Motivation 27 | 28 | Using `dlopen()` to load optional dependencies brings several advantages: programs can gracefully downgrade 29 | a feature when a library is not available, and the shared library is loaded into the process (and its 30 | ELF constructors are run) only when the requested feature is actually used.
But it also has some drawbacks, 31 | and the main one is that it is harder to track a program's dependencies, since unlike build-time dynamic 32 | linking there will not be a mention in the ELF metadata. This specification aims to solve this problem by 33 | providing a standardized specification for a custom ELF note that can be used to list `dlopen()` 34 | dependencies. 35 | 36 | ## Implementation 37 | 38 | This document will attempt to define a common metadata format specification, so that multiple implementers 39 | might use it when coding upstream software, and packagers might use it when building packages and setting 40 | dependencies. 41 | 42 | The metadata will be embedded in a series of new, 4-byte-aligned, allocated, 0-padded, read-only ELF header 43 | sections, in a JSON array containing name-value objects, either one ELF note per dependency or as a single 44 | note listing multiple dependencies in the top-level array. Implementers working on parsing ELF files should 45 | not assume a specific list of names, but parse anything that is included in the section, and should look for 46 | the note using the `note type`. Implementers working on build tools should strive to use the same names, for 47 | consistency. The most common will be listed here. 48 | 49 | * Section header 50 | 51 | ``` 52 | SECTION: `.note.dlopen` 53 | note type: `0x407c0c0a` 54 | Owner: `FDO` (FreeDesktop.org) 55 | Value: an array of JSON objects encoded as a zero-terminated UTF-8 string 56 | ``` 57 | 58 | * JSON payload 59 | 60 | ```json 61 | [ 62 | { 63 | "soname": ["libfoo.so.1"], 64 | "feature": "foo", 65 | "description": "Enables the foo feature", 66 | "priority": "recommended" 67 | } 68 | ] 69 | ``` 70 | 71 | The format is a single JSON array containing objects, encoded as a zero-terminated `UTF-8` string. Each key 72 | in each object shall be unique as per recommendations of [RFC8259](https://datatracker.ietf.org/doc/html/rfc8259#section-4). 
73 | Strings shall not contain any control characters or use `\uXXXX` escaping. 74 | 75 | Reference implementations of [packaging tools for `.deb` and `.rpm`](https://github.com/systemd/package-notes) 76 | are available, and provide macros/helpers to parse the note when building packages and adding dependencies. 77 | 78 | ## Well-known keys 79 | 80 | The metadata format is intentionally extensible, so that upstreams and later revisions of this spec can add 81 | their own information. The 'soname' array is required, with at least one element; everything else is 82 | optional. If alternative soname versions for the same library are supported at the same time, an array can 83 | be used, listing the most preferred first, and parsers are expected to select only the first one that is 84 | available on the system, as it is a mechanism to specify alternatives. If the `priority` field is used, it 85 | must follow the specification and use one of the values specified in the table. If it is not specified, a 86 | parser should assume 'recommended' if a priority is needed. If the `feature` field is used, it will identify 87 | an individual feature, and multiple entries using the same `feature` denote functionality that requires all 88 | of the libraries they specify in order to be enabled.
89 | 90 | | Key name | Key type | Mandatory | Key description | Example value | 91 | |-------------|----------------------------|-----------|--------------------------------------------------------------------------|----------------------------------| 92 | | soname | array of strings | yes | The library names loaded by `dlopen()` | [ "libfoo.so.1", "libfoo.so.0" ] | 93 | | feature | string | no | A keyword identifying the feature that the library contributes to enable | "foo" | 94 | | description | string | no | A human-readable text string describing the feature | "Enables the foo feature" | 95 | | priority | string | no | The priority of the feature, one of: required, recommended, suggested | "recommended" | 96 | 97 | ### Priority definition 98 | 99 | | Priority | Semantics | 100 | |-------------|--------------------------------------------------------------------------------------------------------------------------------------| 101 | | required | Core functionality needs the dependency, the binary will not work if it cannot be found | 102 | | recommended | Important functionality needs the dependency, the binary will work but in most cases the dependency should be provided | 103 | | suggested | Secondary functionality needs the dependency, the binary will work and the dependency is only needed for full-featured installations | 104 | 105 | ### Displaying `dlopen()` notes 106 | 107 | The raw ELF section can be extracted using `objdump`: 108 | ```console 109 | $ objdump -j .note.dlopen -s /usr/lib64/systemd/libsystemd-shared-257.so 110 | 111 | /usr/lib64/systemd/libsystemd-shared-257.so: file format elf64-x86-64 112 | 113 | Contents of section .note.dlopen: 114 | 0334 04000000 8e000000 0a0c7c40 46444f00 ..........|@FDO. 
115 | 0344 5b7b2266 65617475 7265223a 22627066 [{"feature":"bpf 116 | 0354 222c2264 65736372 69707469 6f6e223a ","description": 117 | 0364 22537570 706f7274 20666972 6577616c "Support firewal 118 | 0374 6c696e67 20616e64 2073616e 64626f78 ling and sandbox 119 | 0384 696e6720 77697468 20425046 222c2270 ing with BPF","p 120 | 0394 72696f72 69747922 3a227375 67676573 riority":"sugges 121 | 03a4 74656422 2c22736f 6e616d65 223a5b22 ted","soname":[" 122 | 03b4 6c696262 70662e73 6f2e3122 2c226c69 libbpf.so.1","li 123 | 03c4 62627066 2e736f2e 30225d7d 5d000000 bbpf.so.0"]}]... 124 | 03d4 04000000 9e000000 0a0c7c40 46444f00 ..........|@FDO. 125 | ... 126 | ``` 127 | 128 | It is more convenient to use a higher level tool: 129 | ```console 130 | $ dlopen-notes /usr/lib64/systemd/libsystemd-shared-257.so 131 | # /usr/lib64/systemd/libsystemd-shared-257.so 132 | [ 133 | { 134 | "feature": "archive", 135 | "description": "Support for decompressing archive files", 136 | "priority": "suggested", 137 | "soname": [ 138 | "libarchive.so.13" 139 | ] 140 | }, 141 | { 142 | "feature": "bpf", 143 | "description": "Support firewalling and sandboxing with BPF", 144 | "priority": "suggested", 145 | "soname": [ 146 | "libbpf.so.1", 147 | "libbpf.so.0" 148 | ] 149 | }, 150 | ... 151 | ``` 152 | 153 | `dlopen-notes` can display the notes grouped in a few different ways. 154 | One option is to filter the libraries by "feature". 
This answers the 155 | question "what libraries are needed to provide specified features": 156 | 157 | ```console 158 | $ dlopen-notes.py -f archive,bpf /usr/lib64/systemd/libsystemd-shared-257.so 159 | # grouped by feature 160 | { 161 | "bpf": { 162 | "description": "Support firewalling and sandboxing with BPF", 163 | "sonames": { 164 | "libbpf.so.1": "suggested", 165 | "libbpf.so.0": "suggested" 166 | } 167 | }, 168 | "archive": { 169 | "description": "Support for decompressing archive files", 170 | "sonames": { 171 | "libarchive.so.13": "suggested" 172 | } 173 | } 174 | } 175 | ``` 176 | 177 | The format that is used when building `deb` packages: 178 | ```console 179 | $ dlopen-notes -s /usr/lib64/systemd/libsystemd-shared-257.so 180 | libarchive.so.13 suggested 181 | libbpf.so.0 suggested 182 | libbpf.so.1 suggested 183 | ... 184 | ``` 185 | 186 | The format that can be useful when building `rpm` packages: 187 | ```console 188 | $ dlopen-notes --rpm-requires archive --rpm-recommends bpf /usr/lib64/systemd/libsystemd-shared-257.so 189 | Requires: libarchive.so.13()(64bit) 190 | Recommends: libbpf.so.1()(64bit) 191 | ``` 192 | -------------------------------------------------------------------------------- /specs/package_metadata_for_executable_files.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: UAPI.8 Package Metadata for Executable Files 3 | category: Interfaces 4 | layout: default 5 | version: 1.0 6 | SPDX-License-Identifier: CC-BY-4.0 7 | weight: 8 8 | aliases: 9 | - /UAPI.8 10 | - /8 11 | --- 12 | 13 | # UAPI.8 Package Metadata for Executable Files 14 | 15 | | Version | Changes | 16 | |---------|-----------------| 17 | | 1.0 | Initial Release | 18 | 19 | ## Target Audience 20 | 21 | The target audience for this specification is: 22 | 23 | * Developers working on userspace subsystems that create or manipulate ELF or PE/COFF binaries 24 | * Developers working on userspace subsystems that parse core 
files 25 | 26 | ## Motivation 27 | 28 | ELF binaries get stamped with a unique, build-time generated hex string identifier called `build-id`, 29 | [which gets embedded as an ELF note called `.note.gnu.build-id`](https://fedoraproject.org/wiki/Releases/FeatureBuildId). 30 | In most cases, this allows a stripped binary to be associated with its debugging information. 31 | It is used, for example, to dynamically fetch DWARF symbols from a debuginfo server, or 32 | to query the local package manager and find out the package metadata or, again, the DWARF 33 | symbols or program sources. 34 | 35 | However, this usage of the `build-id` requires either local metadata, usually set up by 36 | the package manager, or access to a remote server over the network. Both of those might 37 | be unavailable or forbidden. 38 | 39 | Thus it becomes desirable to add additional metadata to a binary at build time, so that 40 | `systemd-coredump` and other services analyzing core files are able to extract said 41 | metadata simply from the core file itself, without external dependencies. 42 | 43 | This metadata is stored as a section in the executable file, 44 | so that it will be loaded into memory along with the text and data of the binary, 45 | and will be preserved in a core dump. 46 | This metadata can also be easily read from the file on disk, 47 | so it can be used to identify provenience of files, 48 | independently of any package management system, 49 | even if the file is renamed or copied. 50 | 51 | ## Implementation 52 | 53 | This document will attempt to define a common metadata format specification, so that 54 | multiple implementers might use it when building packages, or core file analyzers, and 55 | so on. 56 | 57 | Implementers working on parsing the metadata should not assume a specific list of names, 58 | but parse anything that is included in the JSON object. 59 | 60 | Implementers working on build tools should strive to use the same names, for consistency. 
61 | The most common will be listed here. 62 | When corresponding to the content of os-release, the values should match, again for consistency. 63 | 64 | If available, the metadata should also include the debuginfod server URL that can provide 65 | the original executable, debuginfo and sources, to further facilitate debugging. 66 | 67 | ### ELF header section 68 | 69 | The metadata will be embedded in a single, 4 byte-aligned, allocated, NUL-padded, 70 | read-only ELF header section, in a name-value JSON object format. 71 | The JSON string is terminated with a NUL 72 | and subsequently padded with NULs to a multiple of four bytes. 73 | 74 | The `note type` must be set during creation and checked when reading. 75 | 76 | Section: `.note.package`
77 | `note type`: `0xcafe1a7e`
78 | Owner: `FDO` (FreeDesktop.org)
79 | Value: a single JSON object encoded as a NUL-terminated UTF-8 string 80 | 81 | ### PE/COFF section 82 | 83 | The metadata will be embedded in a single, allocated, NUL-padded, 84 | read-only COFF data section, 85 | in a name-value JSON object format. 86 | The JSON string is terminated with a NUL 87 | and subsequently padded with NULs if appropriate. 88 | The `IMAGE_SCN_CNT_INITIALIZED_DATA` section flag shall be set. 89 | The alignment and padding shall be chosen as appropriate for the use of the PE/COFF file. 90 | 91 | Section: `.pkgnote`
92 | Value: a single JSON object encoded as a NUL-terminated UTF-8 string 93 | 94 | ### JSON payload 95 | 96 | ```json 97 | { 98 | "type":"rpm", # this provides a namespace for the package+package-version fields 99 | "os":"fedora", 100 | "osVersion":"33", 101 | "name":"coreutils", 102 | "version":"4711.0815.fc13", 103 | "architecture":"arm32", 104 | "osCpe": "cpe:2.3:o:fedoraproject:fedora:33", # A CPE name for the operating system, `CPE_NAME` from os-release is a good default 105 | "appCpe": "cpe:2.3:a:gnu:coreutils:5.0", # A CPE name for the upstream application, use NVD CPE search 106 | "debugInfoUrl": "https://debuginfod.fedoraproject.org/" 107 | } 108 | ``` 109 | 110 | The format is a single JSON object, 111 | encoded as a NUL-terminated `UTF-8` string. 112 | Each name in the object shall be unique as per recommendations of 113 | [RFC8259](https://datatracker.ietf.org/doc/html/rfc8259#section-4). 114 | Strings shall not contain any control characters or use `\uXXXX` escaping. 115 | 116 | When it comes to JSON numbers, this specification assumes that JSON parsers 117 | processing this information are capable of reproducing the full signed 53-bit 118 | integer range (i.e. -2⁵³+1…+2⁵³-1) as well as the full 64-bit IEEE floating 119 | point number range losslessly (with the exception of NaN/-inf/+inf, since JSON 120 | cannot encode those), as per recommendations of 121 | [RFC8259](https://datatracker.ietf.org/doc/html/rfc8259#page-8). Fields in 122 | these JSON objects are thus permitted to encode numeric values from these 123 | ranges as JSON numbers, and should not use numeric values not covered by these 124 | types and ranges. 125 | 126 | If available, the metadata should also include the debuginfod server URL that can provide 127 | the original executable, debuginfo and sources, to further facilitate debugging.
128 | 129 | Reference implementations of [packaging tools for .deb and .rpm](https://github.com/systemd/package-notes) 130 | are available, and provide macros/helpers to include the note in binaries built 131 | by the package build system. 132 | They make use of the new `--package-metadata=` flag that is available in the 133 | `bfd`, `gold`, `mold`, and `lld` linkers 134 | (versions 2.39, 2.39, 1.3.0, and 15.0 respectively). 135 | This linker flag takes the JSON payload as parameter. 136 | 137 | ## Well-known keys 138 | 139 | The metadata format is intentionally left open, so that vendors can add their own information. 140 | A set of well-known keys is defined here, and hopefully shared among all vendors. 141 | 142 | | Key name | Key description | Example value | 143 | |--------------|--------------------------------------------------------------------------|---------------------------------------| 144 | | type | The packaging type | rpm | 145 | | os | The OS name, typically corresponding to ID in os-release | fedora | 146 | | osVersion | The OS version, typically corresponding to VERSION_ID in os-release | 33 | 147 | | name | The source package name | coreutils | 148 | | version | The source package version | 4711.0815.fc13 | 149 | | architecture | The binary package architecture | arm32 | 150 | | osCpe | A CPE name for the OS, typically corresponding to CPE_NAME in os-release | cpe:2.3:o:fedoraproject:fedora:33 | 151 | | appCpe | A CPE name for the upstream Application, as found in [NVD CPE search] | cpe:2.3:a:gnu:coreutils:5.0 | 152 | | debugInfoUrl | The debuginfod server url, if available | https://debuginfod.fedoraproject.org/ | 153 | 154 | [NVD CPE search]: https://nvd.nist.gov/products/cpe/search 155 | 156 | ### Displaying package notes 157 | 158 | The raw ELF section can be extracted using `objdump`: 159 | ```console 160 | $ objdump -j .note.package -s /usr/bin/ls 161 | 162 | /usr/bin/ls: file format elf64-x86-64 163 | 164 | Contents of section 
.note.package: 165 | 03cc 04000000 7c000000 7e1afeca 46444f00 ....|...~...FDO. 166 | 03dc 7b227479 7065223a 2272706d 222c226e {"type":"rpm","n 167 | 03ec 616d6522 3a22636f 72657574 696c7322 ame":"coreutils" 168 | 03fc 2c227665 7273696f 6e223a22 392e342d ,"version":"9.4- 169 | 040c 372e6663 3430222c 22617263 68697465 7.fc40","archite 170 | 041c 63747572 65223a22 7838365f 3634222c cture":"x86_64", 171 | 042c 226f7343 7065223a 22637065 3a2f6f3a "osCpe":"cpe:/o: 172 | 043c 6665646f 72617072 6f6a6563 743a6665 fedoraproject:fe 173 | 044c 646f7261 3a343022 7d000000 dora:40"}... 174 | ``` 175 | 176 | It is more convenient to use a higher level tool: 177 | ```console 178 | $ readelf --notes /usr/bin/ls 179 | ... 180 | Displaying notes found in: .note.gnu.build-id 181 | Owner Data size Description 182 | GNU 0x00000014 NT_GNU_BUILD_ID (unique build ID bitstring) 183 | Build ID: 40e5a1570a9d97fc48f5c61cfb7690fec0f872b2 184 | 185 | Displaying notes found in: .note.ABI-tag 186 | Owner Data size Description 187 | GNU 0x00000010 NT_GNU_ABI_TAG (ABI version tag) 188 | OS: Linux, ABI: 3.2.0 189 | 190 | Displaying notes found in: .note.package 191 | Owner Data size Description 192 | FDO 0x0000007c FDO_PACKAGING_METADATA 193 | Packaging Metadata: {"type":"rpm","name":"coreutils","version":"9.4-7.fc40","architecture":"x86_64","osCpe":"cpe:/o:fedoraproject:fedora:40"} 194 | ... 
195 | 196 | $ systemd-analyze inspect-elf /usr/bin/ls 197 | path: /usr/bin/ls 198 | elfType: executable 199 | elfArchitecture: AMD x86-64 200 | 201 | type: rpm 202 | name: coreutils 203 | version: 9.4-7.fc40 204 | architecture: x86_64 205 | osCpe: cpe:/o:fedoraproject:fedora:40 206 | buildId: 40e5a1570a9d97fc48f5c61cfb7690fec0f872b2 207 | ``` 208 | 209 | If the binary crashes, `systemd-coredump` will display the combined information 210 | from the crashing binary and any shared libraries it links to: 211 | 212 | ```console 213 | $ coredumpctl info 214 | PID: 3987823 (ls) 215 | Signal: 11 (SEGV) 216 | Command Line: ls --color=tty -lR / 217 | Executable: /usr/bin/ls 218 | ... 219 | Storage: /var/lib/systemd/coredump/core.ls.1000.88dea1b9831c420dbb398f9d2ad9b41e.3987823.1726230641000000.zst (present) 220 | Size on Disk: 194.4K 221 | Package: coreutils/9.4-7.fc40 222 | build-id: 40e5a1570a9d97fc48f5c61cfb7690fec0f872b2 223 | Message: Process 3987823 (ls) of user 1000 dumped core. 224 | 225 | Module /usr/bin/ls from rpm coreutils-9.4-7.fc40.x86_64 226 | Module libz.so.1 from rpm zlib-ng-2.1.7-1.fc40.x86_64 227 | Module libcrypto.so.3 from rpm openssl-3.2.2-3.fc40.x86_64 228 | Module libmount.so.1 from rpm util-linux-2.40.1-1.fc40.x86_64 229 | Module libcrypt.so.2 from rpm libxcrypt-4.4.36-5.fc40.x86_64 230 | Module libblkid.so.1 from rpm util-linux-2.40.1-1.fc40.x86_64 231 | Module libnss_sss.so.2 from rpm sssd-2.9.5-1.fc40.x86_64 232 | Module libpcre2-8.so.0 from rpm pcre2-10.44-1.fc40.x86_64 233 | Module libcap.so.2 from rpm libcap-2.69-8.fc40.x86_64 234 | Module libselinux.so.1 from rpm libselinux-3.6-4.fc40.x86_64 235 | Stack trace of thread 3987823: 236 | #0 0x00007f19331c3f7e lgetxattr (libc.so.6 + 0x116f7e) 237 | #1 0x00007f19332be4c0 lgetfilecon_raw (libselinux.so.1 + 0x134c0) 238 | #2 0x00007f19332c3bd9 lgetfilecon (libselinux.so.1 + 0x18bd9) 239 | #3 0x000056038273ad55 gobble_file.constprop.0 (/usr/bin/ls + 0x17d55) 240 | #4 0x0000560382733c55 print_dir 
(/usr/bin/ls + 0x10c55) 241 | #5 0x0000560382727c35 main (/usr/bin/ls + 0x4c35) 242 | #6 0x00007f19330d7088 __libc_start_call_main (libc.so.6 + 0x2a088) 243 | #7 0x00007f19330d714b __libc_start_main@@GLIBC_2.34 (libc.so.6 + 0x2a14b) 244 | #8 0x0000560382728f15 _start (/usr/bin/ls + 0x5f15) 245 | ELF object binary architecture: AMD x86-64 246 | ``` 247 | 248 | (This is just a simulation. `ls` is not prone to crashing with a segmentation violation.) 249 | -------------------------------------------------------------------------------- /specs/extension_image.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: UAPI.4 Extension Images 3 | category: Concepts 4 | layout: default 5 | version: 1.0 6 | SPDX-License-Identifier: CC-BY-4.0 7 | weight: 4 8 | aliases: 9 | - /UAPI.4 10 | - /4 11 | --- 12 | # UAPI.4 Extension Images 13 | 14 | | Version | Changes | 15 | |---------|-----------------| 16 | | 1.0 | Initial Release | 17 | 18 | Extension Images are DDIs ([Discoverable Disk Images](discoverable_disk_image.md)) that are 19 | built to extend a base system via an overlay. A base system or a root DDI can be extended by several extension 20 | DDIs via, usually, a read-only OverlayFS. The defining characteristic of an Extension Image is that it contains 21 | an `extension-release.` file that identifies itself and the base system or root DDI it applies to, 22 | and must not contain an `os-release` file. 23 | 24 | ## Ordering 25 | The default order in which extensions are applied is based on lexicographic sorting as 26 | [defined in the Version Format Specification](version_format_specification.md), with images sorting as 27 | older being placed lower in the overlay. Implementations may allow a different order to be explicitly 28 | specified instead. 29 | 30 | ## Image Format 31 | Extensions are DDIs ([Discoverable Disk Images](discoverable_disk_image.md)), so the file format will not be 32 | redefined here. 
33 | 34 | ## Extension Types 35 | There are two types of extension images, sysext (System Extension) and confext (Configuration Extension). 36 | They are differentiated by the directory hierarchies they contain. 37 | 38 | ### sysext (System Extension) 39 | sysext images extend `/usr/` (OS vendor tree) and/or `/opt/` (third-party vendor tree). They must contain a 40 | `/usr/lib/extension-release.d/extension-release.<IMAGE>` file to identify them. 41 | 42 | ### confext (Configuration Extension) 43 | confext images extend `/etc/`. They must contain an `/etc/extension-release.d/extension-release.<IMAGE>` file 44 | to identify them. 45 | 46 | ## Image Content 47 | Extension Images should be additive, and not override content present in the base image or other DDIs. 48 | However, there currently is no safe and efficient way to detect collisions and to enforce content uniqueness 49 | across the stack of images. 50 | In a future version of this specification, options for enforcing uniqueness may be provided. 51 | 52 | ## Base Directory Immutability / Mutability 53 | By default, applying ("merging") an Extension Image on a mutable filesystem renders the underlying base 54 | directory (`/etc/`, `/usr/`, `/opt/`) immutable. 55 | By implication, merging a confext on a mutable filesystem will result in `/etc/` becoming read-only, and 56 | merging a sysext might render `/usr/` and/or `/opt/` read-only, depending on the sysext's contents. 57 | 58 | This affects base directories actually contained in extensions merged to a root filesystem: 59 | for instance, if a sysext extends `/usr/` but not `/opt/`, `/opt/` will remain mutable if 60 | it's on a mutable filesystem. 61 | However, immutability affects the _full_ base directory. 62 | Merging an extension on a mutable filesystem that ships a single custom path e.g. below 63 | `/etc/appconfig-extra/some/sub/path/` will still render the entirety of `/etc/` immutable.
64 | The base directory or directories remain immutable for as long as extensions are merged. 65 | Mutability, if present before the merge, is regained only after all extensions overlaying a base directory 66 | have been un-merged. 67 | 68 | ### Optional Mutability 69 | While overlaid base directories are immutable by default, implementations may provide options 70 | for mutability. 71 | Retaining mutability may for instance be useful for compatibility with general purpose applications, 72 | enabling users to operate a "mixed mode" with both system and configuration extensions 73 | and traditional applications. 74 | Mixed mode may also be integrated by distributions to facilitate a smooth transition from 75 | traditional package management to a purely image-based composition of the root file system. 76 | Lastly, mutability mode may be used on an originally immutable filesystem to allow and to capture 77 | temporary changes. 78 | 79 | ### Extension Overlay [Im]Mutability Modes 80 | System and configuration extensions may operate in one of three modes. 81 | 82 | 1. *Immutable mode* - The overlaid base directory is immutable. 83 | This is the default. 84 | 2. *Mutable mode* - Writes are directed to an _upperdir_ specified by the user or operator 85 | (see "Mutability Mode Configuration" below). 86 | This _upperdir_ will contain all changes made to the overlaid base directory. 87 | 1. _Upperdir_ may be specified to _be_ the base directory: may be used to retain mutability 88 | of the base directory after extensions have been merged. 89 | 2. Alternatively, _upperdir_ may be an entirely separate directory: modifications will be captured 90 | but the base directory will remain unchanged, retaining its state from before the extension was merged. 91 | 3. *Ephemeral Mode* - Similar to mutable mode (2.) above but writes are only stored temporarily while 92 | extensions are merged, and discarded as soon as extensions are un-merged. 
93 | Location of temporary storage is implementation-specific. 94 | Useful for e.g. development and for one-shot validation operations. 95 | 96 | ### Mutability Mode Configuration 97 | Immutable mode is the default. 98 | If none of the configurations outlined below were specified then extension overlays operate 99 | in immutable mode and base directories are read-only. 100 | Implementation of any of the below mutable configurations is optional. 101 | _If_ mutable modes are supported by an implementation, configuration option 1. below _must_ be supported 102 | for compatibility across implementations. 103 | 104 | Mutability modes may be configured in the following ways: 105 | 1. By creating qualified paths or soft-links below `/var/lib/extensions.mutable/`. 106 | See "Qualified Paths Definition" below for details. 107 | This is the most portable option across different implementations. 108 | If an implementation supports base directory mutability then this mode _must_ be supported. 109 | 2. By setting a respective option in an implementation's configuration file. 110 | This option is implementation-specific. 111 | Implementations may choose to support a single option, multiple options for 112 | system and configuration extensions, and/or multiple options per base directory. 113 | Using this option should selectively override any qualified path definitions from 1. 114 | 3. By passing a command line parameter upon extension merge or refresh. 115 | This option is implementation-specific. 116 | Implementations may choose to support a single option and/or multiple options per base directory. 117 | Using this option should selectively override any configurations from 1. and 2. 118 | 119 | #### Qualified Paths Definition 120 | 121 | Mutability Mode 1 enables mutability by creating paths or soft-links below 122 | `/var/lib/extensions.mutable/`. 
123 | Qualified paths are: 124 | * `/var/lib/extensions.mutable/etc/` - directory or soft-link to a directory to store writes to 125 | `/etc/`. This is for configuration extensions. 126 | * `/var/lib/extensions.mutable/usr/` - directory or soft-link to a directory to store writes to 127 | `/usr/`. This is for system extensions. 128 | * `/var/lib/extensions.mutable/opt/` - directory or soft-link to a directory to store writes to 129 | `/opt/`. This is for system extensions. 130 | 131 | Each base directory is treated separately. 132 | The existence and the type of each qualified path determine the mutability mode used. 133 | The following mutability modes are supported: 134 | * _Path does not exist_ - immutable mode. 135 | * Path is a _directory, subvolume, or mount point_ - the path at 136 | `/var/lib/extensions.mutable/<basedir>/` is used to store writes to `/<basedir>/`. 137 | * A tmpfs mount at the qualified path may be used for a custom ephemeral mode. 138 | In this case, clean-up of the tmpfs is left to the user and/or is implementation-specific. 139 | * Path is a _soft link_ - the soft link is followed and writes are stored at the link's destination. 140 | * If the destination is the base directory - i.e. `/var/lib/extensions.mutable/<basedir>/` 141 | points to `/<basedir>/` - then the stacking order changes and `<basedir>` becomes _upperdir_. 142 | Writes are directed to the base directory and files and paths present in the base directory 143 | override files and paths in extensions if present. 144 | * The soft link may point to a tmpfs destination for custom ephemeral mode. 145 | In this case, clean-up of the tmpfs is left to the user and/or is implementation-specific. 146 | * If the destination does not exist, immutable mode is used. 147 | 148 | ## File Suffix 149 | Since extension images are DDIs, they should carry the `.raw` suffix.
In order to make discerning system 150 | extensions and configuration extensions easy it is recommended to use the `.sysext.raw` suffix for system 151 | extensions, and `.confext.raw` for configuration extensions. 152 | 153 | ## Identification 154 | An Extension Image must contain a `extension-release.` file, where `` must either match the 155 | name of the sysext minus the suffix, or alternatively `extension-release.` must be tagged with a 156 | `user.extension-release.strict` xattr set to the string `"0"` in order to be valid. This is to make it 157 | obvious to users that a sysext is used for its purpose. 158 | The format of `extension-release.` is the same as the 159 | [`os-release` file](https://www.freedesktop.org/software/systemd/man/os-release.html), and it is a 160 | newline-separated list of environment-like shell-compatible variable assignments. New fields 161 | `SYSEXT_LEVEL=` and `CONFEXT_LEVEL=` have been introduced to allow an implementation to match a sysext or 162 | a confext with the base image upon which it is layered: if the field is present, it must match between the 163 | layers or the Extension Image must be ignored, while if it is not present, but `VERSION_ID=` is, then the 164 | latter must match instead. 165 | In addition, the `ID=` field must be present and match the base image's, or be set to the special value 166 | `_any`, in case the Extension Image can be used on any Linux distribution. 167 | 168 | ### Fields in extension-release — Matching with the base system or DDI 169 | The following fields are used in order to match with the base system or DDI. 170 | #### `SYSEXT_LEVEL=` `CONFEXT_LEVEL=` 171 | A lower-case string (mostly numeric, no spaces or other characters outside of 0–9, a–z, ".", "_" and 172 | "-") identifying the operating system extensions support level, to indicate which extension images are 173 | supported. 174 | 175 | Examples: `"SYSEXT_LEVEL=2"`, `"CONFEXT_LEVEL=15.14"`. 
176 | 177 | If not present, and if `VERSION_ID=` is present instead, then this will be checked instead. 178 | 179 | #### `VERSION_ID=`, `ID=`, `ARCHITECTURE=` 180 | `VERSION_ID=` and `ID=` are used to match the Extension Image with the root DDI, and `ARCHITECTURE=` is used 181 | to match with the host's CPU architecture, as defined in the 182 | [`os-release` specification](https://www.freedesktop.org/software/systemd/man/os-release.html). 183 | `ID=` and `ARCHITECTURE=` also support specifying the `_any` wildcard, which allows the matching mechanism 184 | to be bypassed. 185 | 186 | ### Fields in extension-release — Identifying the Extension Image 187 | The identification fields defined in the 188 | [`os-release` specification](https://www.freedesktop.org/software/systemd/man/os-release.html) 189 | can be used to also identify the sysext itself, by prefixing them with `SYSEXT_`. For example, 190 | `SYSEXT_ID=myext` `SYSEXT_VERSION_ID=0.1` denotes a 'myext' sysext of version '0.1'. 191 | There are also extension-specific fields that do not apply to 'os-release', `SYSEXT_SCOPE=`, 192 | `CONFEXT_SCOPE=` and `ARCHITECTURE=`. 193 | 194 | #### `SYSEXT_SCOPE=`, `CONFEXT_SCOPE=` 195 | Takes a space-separated list of one or more of the strings `"system"`, `"initrd"` and `"portable"`. This field 196 | is optional and indicates what environments the system extension is applicable to: i.e. to regular systems, 197 | to initrds, or to [portable service images](https://systemd.io/PORTABLE_SERVICES/). If unspecified, 198 | `"SYSEXT_SCOPE=system portable"` is implied, i.e. any system extension without this field is applicable to 199 | regular systems and to portable service environments, but not to initrd environments. 200 | 201 | #### `ARCHITECTURE=` 202 | A string that specifies which CPU architecture the userspace binaries require. This field is optional 203 | and should only be used when just a single architecture is supported.
It may provide redundant 204 | information when used in a GPT partition with a GUID type that already encodes the architecture. If this 205 | is not the case, the architecture should be specified in e.g., an extension image, to prevent an 206 | incompatible host from loading it. 207 | 208 | Valid values: 209 | 210 | | Architecture | 211 | |--------------| 212 | | x86 | 213 | | x86-64 | 214 | | alpha | 215 | | arc | 216 | | arc-be | 217 | | arm | 218 | | arm-be | 219 | | arm64 | 220 | | arm64-be | 221 | | cris | 222 | | ia64 | 223 | | loongarch64 | 224 | | m68k | 225 | | mips | 226 | | mips-le | 227 | | mips64 | 228 | | mips64-le | 229 | | parisc | 230 | | parisc64 | 231 | | ppc | 232 | | ppc-le | 233 | | ppc64 | 234 | | ppc64-le | 235 | | riscv32 | 236 | | riscv64 | 237 | | s390 | 238 | | s390x | 239 | | sh | 240 | | sh64 | 241 | | sparc64 | 242 | | sparc | 243 | | tilegx | 244 | | native | 245 | | any | 246 | -------------------------------------------------------------------------------- /specs/linux_tpm_pcr_registry.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: UAPI.7 Linux TPM PCR Registry 3 | version: 1.0 4 | SPDX-License-Identifier: CC0-1.0 5 | weight: 7 6 | aliases: 7 | - /UAPI.7 8 | - /7 9 | --- 10 | 11 | # 🔏 UAPI.7 Linux TPM PCR Registry 🗒️ 12 | 13 | | Version | Changes | 14 | |---------|-----------------| 15 | | 1.0 | Initial Release | 16 | 17 | _TPM PCRs are a scarce resource, there are only 24 of them in typical standards compliant TPMs. 18 | According to the 19 | [TCG PC Client Specific Platform Firmware Profile Specification | Trusted Computing Group](https://trustedcomputinggroup.org/resource/pc-client-specific-platform-firmware-profile-specification/) 20 | the OS can make use of PCRs 8…15. 21 | This document lists which component is using which PCR on a Linux platform 22 | in order to minimize conflicts._ 23 | 24 | PCRs owned by the firmware, i.e. 
PCRs 0–7 are described here just for convenience. 25 | The authoritative description is in the TCG document. 26 | 27 | How other operating systems — in particular Windows — use PCRs, is out of scope of this document. 28 | 29 | This document is informational in nature: it just describes what is, it is not intended to formally declare “ownership” of a specific PCR, but simply is supposed to reflect which PCR assignments are common in the Linux ecosystems. That said, co-opting PCR usage will likely create problems down the line, in particular if measurement logs are maintained separately. (To be more explicit: on `systemd` systems the warranty is voided if you write to the PCRs it also uses, as per the list below.) 30 | 31 | PCR measurements most commonly serve two distinct purposes: 32 | 33 | * To implement access policy on TPM sealed objects: policy can dictate that unsealing of such objects shall only be allowed if some PCRs are in a specific literal state, or in any state for which a signature by a specific key pair can be provided. For this it is essential that PCRs only contain measurements for a clearly defined set of objects, that typically is known in advance so that the PCR value can be pre-calculated (hence this is in a way a _forward_-looking use) 34 | * To permit reasoning about the boot process and runtime _so far_, for example for the purpose of remote attestation. In this case it is not that important what objects are measured as long as a record is kept in a measurement log about what it was. The PCRs are in this case used to validate that log (hence this is in a way a _backward_-looking use) 35 | 36 | In both cases it is important that data measured into the PCRs is carefully chosen. PCRs that shall be useful for policy binding should only cover data objects known in advance, and thus not contain runtime data that cannot be pre-calculated in advance.
PCRs that shall be useful for backward-looking validation should only cover objects that are also written to the appropriate log for the PCR. 37 | 38 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 |

PCR#

Used byFrom LocationMeasured ObjectsLogUse Reported By

0

Firmware 💻UEFI Boot ComponentCore system firmware executable codeUEFI TPM event logn/a

1

Firmware 💻UEFI Boot ComponentCore system firmware data/host platform configuration; typically contains serial and model numbersUEFI TPM event logn/a

2

Firmware 💻UEFI Boot ComponentExtended or pluggable executable code; includes option ROMs on pluggable hardwareUEFI TPM event logn/a

3

Firmware 💻UEFI Boot ComponentExtended or pluggable firmware data; includes information about pluggable hardwareUEFI TPM event logn/a

4

Firmware 💻UEFI Boot ComponentBoot loader and additional drivers; binaries and extensions loaded by the boot loaderUEFI TPM event logn/a

5

Firmware 💻UEFI Boot ComponentGPT/Partition tableUEFI TPM event logn/a
systemd-boot 🚀UEFI Boot ComponentUsed loader.confUEFI TPM event logn/a

7

Firmware 💻UEFI Boot ComponentSecureBoot stateUEFI TPM event logn/a

8

grub 🍲UEFI Boot ComponentCommands and kernel command lineUEFI TPM event logn/a

9

grub 🍲UEFI Boot ComponentAll files read (including kernel image)UEFI TPM event logn/a
Linux kernel 🌰KernelAll passed initrds (when the new LOAD_FILE2 initrd protocol is used)UEFI TPM event logn/a
systemd-tpm2-setup.service 🚀UserspaceState of each NvPCR after anchor measurement/run/log/systemd/tpm2-measure.logn/a
systemd-pcrnvdone.service 🚀UserspaceNvPCR anchor measurement separator/run/log/systemd/tpm2-measure.logn/a

10

IMA 📐KernelProtection of the IMA measurement logIMA event logn/a

11

systemd-stub 🚀UEFI StubAll components of unified kernel images (UKIs)UEFI TPM event login EFI variable StubPcrKernelImage
systemd-pcrphase 🚀UserspaceBoot phase strings, indicating various milestones of the boot process/run/log/systemd/tpm2-measure.logn/a

12

systemd-stub 🚀UEFI StubKernel command line, system credentials, system configuration images, initrd addons, µcode addons, devicetree addonsUEFI TPM event login EFI variable StubPcrKernelParameters

13

systemd-stub 🚀UEFI StubAll system extension images for the initrdUEFI TPM event login EFI variable StubPcrInitRDSysExts

14

shim 🔑UEFI Boot Component“MOK” certificates and hashesUEFI TPM event logn/a

15

systemd-cryptsetup@.service 🚀UserspaceRoot file system volume encryption key/run/log/systemd/tpm2-measure.logn/a
systemd-pcrmachine.service 🚀UserspaceMachine ID (/etc/machine-id)/run/log/systemd/tpm2-measure.logn/a
systemd-pcrfs@.service 🚀UserspaceFile system mount point, UUID, label, partition UUID label of root file system and /var//run/log/systemd/tpm2-measure.logn/a
282 | 283 | PCR 0 changes on firmware updates; PCR 1 changes on basic hardware/CPU/RAM replacements. 284 | 285 | PCR 4 changes on boot loader updates. 286 | The shim project will measure the PE binary it chain loads into this PCR. 287 | If the Linux kernel is invoked as UEFI PE binary, it is measured here, too. 288 | [systemd-stub](https://www.freedesktop.org/software/systemd/man/systemd-stub.html) 289 | measures system extension images read from the ESP here too 290 | (see [systemd-sysext](https://www.freedesktop.org/software/systemd/man/systemd-sysext.html) 291 | and [Extension Images](extension_image.md)). 292 | 293 | PCR 5 changes when partitions are added, modified, or removed. 294 | 295 | PCR 7 changes when UEFI SecureBoot mode is enabled/disabled, or firmware certificates (PK, KEK, db, dbx, …) are updated. 296 | The shim project will measure most of its (non-MOK) certificates and SBAT data into this PCR. 297 | 298 | PCR 11 and 15 as shown in the list above are used by multiple components of systemd. 299 | These are not conflicting uses; 300 | the involved components are properly ordered to cooperatively guarantee predictable behaviour. 301 | 302 | [systemd-stub](https://www.freedesktop.org/software/systemd/man/systemd-stub.html) 303 | measures the ELF kernel image, embedded initrd and other payload of the PE image into PCR 11. 304 | Unlike PCR 4 (where the same data should be measured too), those values should be easy to pre-calculate, 305 | as they only reflect the static parts of the PE binary. 306 | [systemd-pcrphase.service](https://www.freedesktop.org/software/systemd/man/systemd-pcrphase.service.html) 307 | measures boot phase strings into this PCR at various milestones of the boot process. 308 | Use PCR 11 to bind TPM policies to a specific kernel image, possibly with an embedded initrd, 309 | and to a specific boot phase. 
310 | 311 | [systemd-boot](https://www.freedesktop.org/software/systemd/man/systemd-boot.html) 312 | measures the kernel command line into PCR 12. 313 | [systemd-stub](https://www.freedesktop.org/software/systemd/man/systemd-stub.html) 314 | measures any manually specified kernel command line (i.e. a kernel command line that overrides the one embedded in the UKI) 315 | and loaded credentials into this PCR. 316 | This means that if `systemd-boot` and `systemd-stub` are used together, the command line might be measured twice. 317 | 318 | [systemd-stub](https://www.freedesktop.org/software/systemd/man/systemd-stub.html) 319 | measures any [Extension Images](extension_image.md) 320 | it passes to the booted kernel into PCR 13. 321 | 322 | [systemd-cryptsetup](https://www.freedesktop.org/software/systemd/man/systemd-cryptsetup.html) 323 | optionally measures the volume key of activated LUKS volumes into PCR 15. 324 | [systemd-pcrmachine.service](https://www.freedesktop.org/software/systemd/man/systemd-pcrmachine.service.html) 325 | measures the 326 | [machine-id](https://www.freedesktop.org/software/systemd/man/machine-id.html) 327 | into this PCR. 328 | [systemd-pcrfs@.service](https://www.freedesktop.org/software/systemd/man/systemd-pcrfs@.service.html) 329 | measures mount points, file system UUIDs, labels, partition UUIDs 330 | of the root and `/var/` filesystems into this PCR.
331 | 332 | ## Sources 333 | * [systemd-cryptenroll(1)](https://www.freedesktop.org/software/systemd/man/systemd-cryptenroll.html#--tpm2-pcrs=PCR) 334 | * [TCG PC Client Specific Platform Firmware Profile Specification](https://trustedcomputinggroup.org/resource/pc-client-specific-platform-firmware-profile-specification/) 335 | * [shim's README.tpm](https://github.com/rhboot/shim/blob/main/README.tpm) 336 | * [Measured Boot - GNU GRUB Manual 2.06](https://www.gnu.org/software/grub/manual/grub/html_node/Measured-Boot.html) 337 | * [Integrity Measurement Architecture (IMA)](https://sourceforge.net/p/linux-ima/wiki/Home/) 338 | * [edk2-TrustedBootChain/4_Other_Trusted_Boot_Chains.md](https://github.com/tianocore-docs/edk2-TrustedBootChain/blob/main/4_Other_Trusted_Boot_Chains.md) 339 | * [Trusted Platform Module - ArchWiki](https://wiki.archlinux.org/title/Trusted_Platform_Module#Accessing_PCR_registers) 340 | -------------------------------------------------------------------------------- /specs/unified_kernel_image.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: UAPI.5 Unified Kernel Images 3 | category: Concepts 4 | layout: default 5 | version: 1.0 6 | SPDX-License-Identifier: CC-BY-4.0 7 | weight: 5 8 | aliases: 9 | - /UAPI.5 10 | - /5 11 | --- 12 | # UAPI.5 Unified Kernel Images (UKI) 13 | 14 | | Version | Changes | 15 | |---------|-----------------| 16 | | 1.0 | Initial Release | 17 | 18 | A Unified Kernel Image (UKI) is a combination of an UEFI boot stub program, 19 | a Linux kernel image, an optional initrd, and further resources in a single UEFI PE file. 20 | This file can either be directly invoked by the UEFI firmware 21 | (which is useful in particular in some cloud/Confidential Computing environments) 22 | or through a boot loader 23 | (which is generally useful to allow multiple kernel versions with interactive or 24 | automatic selection of version to boot into). 
25 | 26 | Various components of the UKI are provided as PE/COFF sections of the 27 | executable. The stub is a small program that can be executed in UEFI 28 | mode that forms the initial executable part of the combined image. 29 | The stub program loads other resources from its executable, including 30 | in particular the kernel and initrd, and transitions into the kernel. 31 | 32 | This specification defines the format and components (mandatory and optional) of UKIs. 33 | 34 | [systemd-stub](https://www.freedesktop.org/software/systemd/man/systemd-stub.html) 35 | provides the reference implementation of the stub. 36 | 37 | ## UKI File Format 38 | The file format for UKIs is PE/COFF (Portable Executable / Common Object File Format). This is a well-known 39 | industry-standard file format, used for example in UEFI environments, and UKIs follow the standard, so exact 40 | details will not be repeated here. 41 | 42 | UKIs are a PE/COFF file with various resources, listed below, stored in PE sections. 43 | In principle this file can be created with a relatively simple `objcopy` invocation, 44 | but the recommended way is to use a helper program 45 | ([`ukify`](https://www.freedesktop.org/software/systemd/man/ukify.html)), 46 | which takes care of appropriate alignment and facilitates signing for SecureBoot. 47 | 48 | UKIs are UEFI application images, and hence should initialize the `Subsystem` field of the *optional* PE 49 | header to 0x0A (i.e. `IMAGE_SUBSYSTEM_EFI_APPLICATION`). 50 | 51 | ## UKI Components 52 | UKIs consist of the following resources: 53 | 54 | * An UEFI boot stub that forms the initial program. 55 | It contains various PE sections normally required for a program, 56 | including `.text`, `.reloc`, `.data`, and others. 57 | * The Linux kernel in the `.linux` PE section. 58 | * Optionally, information describing the OS this kernel is intended for, in the `.osrel` section.
The contents of this section are derived from `/etc/os-release` of the target OS. They can be useful for presentation of the UKI in the boot loader menu, and ordering it against other entries using the included version information. 59 | * Optionally, the kernel command line in the `.cmdline` section. If this is absent, the loader implementation may allow local overrides instead. 60 | * Optionally, the initrd that the kernel shall unpack and invoke, in the `.initrd` section. 61 | * Optionally, a microcode initrd in the `.ucode` section, to be handed to the kernel before any other initrd. 62 | * Optionally, a splash image to bring to screen before transitioning into the Linux kernel, in the `.splash` section. 63 | * Optionally, a compiled Devicetree, for systems which need it, in the `.dtb` section. 64 | * Optionally, one or more compiled Devicetrees, for systems which need it, each in a separate `.dtbauto` section. The first `.dtbauto` section that matches the current hardware (matching is done either by the first `compatible` property with one from the firmware-provided Devicetree or by the SMBIOS fields using the contents of `.hwids` section as described below) will override the `.dtb` section. 65 | * Optionally, a hardware identification table (also known as [HWID](https://github.com/fwupd/fwupd/blob/main/docs/hwids.md) or [CHID](https://learn.microsoft.com/en-us/windows-hardware/drivers/install/specifying-hardware-ids-for-a-computer)) in the `.hwids` section. 66 | * Optionally, information describing kernel release information (i.e. `uname -r` output) in the `.uname` section. This is also useful for presentation of the UKI in the boot loader menu, and ordering it against other entries. 67 | * Optionally, a CSV file encoding the SBAT metadata for the image, in the `.sbat` section. The [SBAT format is defined by the Shim project](https://github.com/rhboot/shim/blob/main/SBAT.md), and used for UEFI revocation purposes. 
68 | * Optionally, a JSON file encoding expected PCR 11 hash values seen from userspace once the UKI has booted up, along with signatures of these expected PCR 11 hash values, in the `.pcrsig` section. The signatures must also match the key pair described below. 69 | * Optionally, the public part of a public-private key pair in PEM format used to sign the expected PCR 11 value of the image, in the `.pcrpkey` section. 70 | 71 | Note that all of the sections defined above are singletons: 72 | they may appear at most once, 73 | except for the `.dtbauto` section which may appear multiple times. 74 | 75 | Only the `.linux` section is required for the image to be considered a Unified *Kernel* Image. 76 | 77 | A UKI will generally also contain various sections required for the boot stub, 78 | but we don't document those here. 79 | 80 | Boot menus such as [sd-boot](http://www.freedesktop.org/software/systemd/man/sd-boot.html) 81 | and other consumers of UKIs may place additional requirements, 82 | for example only show kernels with the `.osrel` section present. 83 | 84 | ## PE Addons 85 | 86 | UKIs are PE executables that may be executed directly in UEFI mode, and contain a variety of resources 87 | built-in, as described above. Sometimes it's useful to provide a minimal level of modularity and extend UKIs 88 | dynamically with additional resources from separate files. For this purpose UKIs can be combined with one or 89 | more "PE Addons". These are regular PE UEFI application binaries, that can be authenticated via the usual UEFI 90 | SecureBoot logic, and may contain additional PE sections from the list above, that shall be used in 91 | combination with any PE sections of the UKI itself. At UKI invocation time, the EFI stub contained in the UKI 92 | may load one or more of these PE Addons and apply them (after authenticating them via UEFI APIs), combining 93 | them with the resources of the UKI.
94 | 95 | PE Addons may *not* contain `.linux` PE sections (this may be used to distinguish them from UKIs, which must 96 | have this section, see above). 97 | 98 | PE Addons must contain at least one section of the following types: 99 | 100 | * `.cmdline` 101 | * `.dtb` 102 | * `.dtbauto` 103 | * `.ucode` 104 | * `.initrd` 105 | 106 | PE Addons should be sorted by their filename, and applied in this order. In case of `.cmdline` all command 107 | lines provided by addons are suffixed in this order to any command line included in the UKI. In case of 108 | `.dtb` and `.dtbauto` any such section included in the UKI shall be applied first, and those provided by add-ons should then 109 | be applied in order as a fix-up. In case of `.ucode` the contained `cpio` archives should be prefixed to the 110 | regular initrds passed to the kernel, in reverse order. In case of `.initrd` the contained `cpio` archives 111 | should be appended to the regular initrds passed to the kernel. 112 | 113 | PE Addons may include sections of multiple types (e.g. both a `.cmdline` and a `.dtb` section), in which case 114 | all of them should be applied. 115 | 116 | Just like UKIs, PE Addons should have the `Subsystem` field of the *optional* PE header set to 0x0A. 117 | 118 | The PE header's `Machine` field should be set to the local CPU type for the target machine of the Addon. When 119 | enumerating PE Addons to apply, candidates should be skipped when their header field reports a non-native CPU 120 | architecture. 121 | 122 | PE Addons may contain executable code in a `.text` section. This code may be useful to write a friendly error 123 | message to the UEFI console when executed as regular programs. The code should be ignored when the addon is 124 | applied on an UKI.
125 | 126 | ## UKI TPM PCR Measurements 127 | 128 | On systems with a Trusted Platform Module (TPM) 129 | the UEFI boot stub shall measure the sections listed above, 130 | starting from the `.linux` section, 131 | in the order as listed 132 | (which should be considered the *canonical order*). 133 | The `.pcrsig` section is not measured. 134 | 135 | For each section two measurements shall be made into PCR 11 with the 136 | event code `EV_IPL`: 137 | 138 | 1. The section name in ASCII (including one trailing NUL byte) 139 | 2. The (binary) section contents 140 | 141 | The above should be repeated for every section defined above, so that 142 | the measurements are interleaved: section name followed by section 143 | data, followed by the next section name and its section data, and so 144 | on. 145 | 146 | If multiple `.dtbauto` sections are present, only the one that is actually in use shall be measured. 147 | 148 | ## JSON Format for `.pcrsig` 149 | The format is a single JSON object, encoded as a zero-terminated `UTF-8` string. Each name in the object 150 | shall be unique as per recommendations of 151 | [RFC8259](https://datatracker.ietf.org/doc/html/rfc8259#section-4). Strings shall not contain any control 152 | character, nor use `\uXXXX` escaping. 153 | 154 | When it comes to JSON numbers, this specification assumes that JSON parsers processing this information 155 | are capable of reproducing the full signed 53bit integer range (i.e. -2⁵³+1…+2⁵³-1) as well as the full 156 | 64bit IEEE floating point number range losslessly (with the exception of NaN/-inf/+inf, since JSON cannot 157 | encode that), as per recommendations of [RFC8259](https://datatracker.ietf.org/doc/html/rfc8259#page-8). 158 | Fields in these JSON objects are thus permitted to encode numeric values from these ranges as JSON numbers, 159 | and should not use numeric values not covered by these types and ranges.
160 | 161 | The content is a JSON object, named after the TPM SHA bank to use, containing an array of measurement 162 | objects, each containing an array of PCRs, the SHA256 fingerprint of the public key (DER) used for the 163 | signature (`pkfp`), the expected hash (`pol`) and the signature encoded in base64 (`sig`). 164 | 165 | Example: 166 | 167 | ``` 168 | { 169 | "sha1": [ 170 | { 171 | "pcrs": [ 172 | 11 173 | ], 174 | "pkfp": "2870989436ec5c24461f36f5f070613043c30a156a895903e27fc985d1b2887f", 175 | "pol": "4a5cfbca5123490989ac060ec8b1755cfa6f0ea37ec39206e988442a9a9023bb", 176 | "sig": "X9a07Peo0EaEWr0dfUgZIq3Bsf20AGTjAgMilyH3TkLtPBGJLCEFRzK2jkPohG0VXQjao35765Wp/sV1wfctGC0fx9GOsBzK8YKjsFitOw21aLxlnES31D3PbDLPRqkx+fAhwV0/Akd99hNuiyzGdUewNpbbBNo7WXkd4K62RK61dKKI4g//qtLeAyXlee0TLKVxNcT46Ud1t8eUb1GAwRnO7DxBZx8uFyP/D9wpPNK7+M01to74d9ijcsjLXf2eGKcpiDvenUnhI6ua+OvT6CnmgxkFQutLGz/Ka23spSG/YJHfxGT7VpOYveDG19nqBb/fg30HZiY7lVTolS93UA==" 177 | } 178 | ], 179 | "sha256": [ 180 | { 181 | "pcrs": [ 182 | 11 183 | ], 184 | "pkfp": "2870989436ec5c24461f36f5f070613043c30a156a895903e27fc985d1b2887f", 185 | "pol": "707f5d03325822b2a53bfe5d723e0ca290f397c0e6184131b70d00e35224488a", 186 | "sig": "moQh6GF18LiVlA8CxRkTtbXr2p0NIIBosLazDALZ9lOJQw/w1PB7tcDZ1Kumvzqtx4FO5WVjOkVTnNFrYmXn9K2PpqIDEuTtwaM/lKgP12LtcC635C+VsJMQg3k9sEFfLwBCzrhYxt5GCpxzPrsfwJtsUpueB23sNw27WJS7C+tVnqWw7br6i9vJ59jP9+HXlex+OlZHliHLzZwpuZA8iPMQT0xvm901ak5yoBqNPv4Yya19dlt2sCuO+Iw1LeZW9U83zdG0hn1mxavRIxZ7s0f7a1n/ScrOksgPQB8xfDdFDf9fssGALanOgjCHyD7hRzV31++Qpgah4uc/LJiesg==" 187 | } 188 | ] 189 | } 190 | ``` 191 | 192 | The [`systemd-measure`](https://www.freedesktop.org/software/systemd/man/systemd-measure.html) tool can be 193 | used to generate and sign `.pcrsig`. 194 | 195 | ## Multi-Profile UKIs 196 | 197 | In various contexts it is useful to support multiple different configurations ("profiles") an UKI may be 198 | booted into. 
An example: a single UKI that can be booted with one of three different kernel command lines, 199 | one covering regular boot, one implementing a factory reset logic, and a third one booting into Storage 200 | Target Mode, or similar. In order to support this, *Multi-Profile UKIs* may be defined, as an optional 201 | extension of the regular UKI concept described above. 202 | 203 | Multi-profile UKIs extend regular UKIs by introducing an additional PE section with the name `.profile` which 204 | can appear multiple times in a single PE file and both acts as a separator between multiple profiles of the 205 | same UKI, and carries meta-information about the profile it is introducing. All regular UKI PE sections 206 | listed above may appear multiple times in multi-profile UKIs, but only once before the first `.profile` PE 207 | section, once between each subsequent pair of `.profile` sections, and once after the last `.profile` (except 208 | for `.dtbauto`, which is allowed to be defined multiple times anyway, see above). Each `.profile` section 209 | introduces and defines a profile, which are numbered from zero, and typically denoted with an `@` character 210 | before the profile number, i.e. `@0`, `@1`, `@2`, … The sections listed in the PE binary before the first 211 | `.profile` section make up a special profile called the *base profile*. 212 | 213 | When a multi-profile UKI is invoked, the EFI stub code will make sure to load the PE sections matching the 214 | selected profile. A profile is (optionally) selected by prefixing the EFI stub's invocation parameters 215 | ("command line") with `@0 `, `@1 `, `@2 `, (i.e. an `@` character, the numeric profile index, and a space 216 | character) in order to select the desired profile. The stub combines the PE sections of the selected profile 217 | with any PE sections from the base profile that are not specified in the selected profile. 
Or in other words: 218 | sections associated with specific profiles comprehensively override those of the same name in the base 219 | profile. If a multi-profile UKI is invoked without specification of a profile selector on its command line, 220 | profile `@0` is automatically selected as default. 221 | 222 | The profile selector prefix of the UKI's invocation parameters is stripped after parsing, and is thus neither 223 | passed on to the invoked kernel on the kernel's command line, nor is measured as part of the kernel command 224 | line. 225 | 226 | When measuring PE sections before passing control to the contained kernel, only the sections associated with 227 | the selected profile, or the base profile are measured. All others are ignored (neither measured nor used in 228 | any other way). 229 | 230 | A `.profile` section may optionally contain meta-information about the profile it introduces that a boot menu 231 | can use to automatically synthesize menu entries from the profiles a UKI defines. It contains text data, 232 | following a similar syntax as `.osrel` sections: environment-block like key-value pairs. Currently, two 233 | fields are defined: `ID=` may contain a brief textual, 7bit ASCII identifier for the profile. `TITLE=` may 234 | contain a brief human readable text string that may be shown in a boot menu that allows profile selection. 
235 | 236 | A brief example for the structure of a hypothetical multi-profile UKI: 237 | 238 | | Section | Contents | Profile | 239 | |----------------|-------------------------------------------------------------|---------| 240 | | `.linux` | ELF kernel | Base | 241 | | `.osrel` | `/etc/os-release` | Base | 242 | | `.cmdline` | `"quiet"` | Base | 243 | | **`.profile`** | `ID=regular TITLE="Regular boot"` | `@0` | 244 | | **`.profile`** | `ID=factory-reset TITLE="Reset Device to Factory Defaults"` | `@1` | 245 | | `.cmdline` | `"quiet systemd.unit=factory-reset.target"` | `@1` | 246 | | **`.profile`** | `ID=storagetm TITLE="Boot into Storage Target Mode"` | `@2` | 247 | | `.cmdline` | `"quiet rd.systemd.unit=storage-target-mode.target"` | `@2` | 248 | 249 | (Note: in this example, the `.cmdline` shown as part of the base profile might as well be moved into profile 250 | `@0` with identical effect. This is because every other profile overrides it anyway, and thus it only applies 251 | to profile `@0` either way.) 252 | 253 | While the primary use case for multi-profile UKIs is allowing multiple kernel command line section 254 | (i.e. `.cmdline`) choices, the concept is not limited to that: any of the UKI PE sections may appear in 255 | profiles, for example to allow alternative selection of multiple different CPU microcode or Devicetree blobs. 256 | 257 | Note that if the PCR signature mechanism described above is used it is recommended to include a separate 258 | `.pcrsig` PE section in each profile matching precisely the sections that apply to that profile (i.e. the 259 | combination of the profile's own sections and those of the base profile).
260 | 261 | ## Updatability 262 | UKIs wrap all of the above data in a single file, hence all of the above components can be updated in one go 263 | through single file atomic updates, which is useful given that the primary expected storage place for these 264 | UKIs is the UEFI System Partition (ESP), which is a vFAT file system, with its limited data safety guarantees. 265 | 266 | ## Security 267 | Given UKIs are regular UEFI PE files, they can thus be signed as one for Secure Boot, protecting all of the 268 | individual resources listed above at once, and their combination. Standard Linux tools such as 269 | [`sbsigntool`](https://manpages.debian.org/unstable/sbsigntool/sbsign.1.en.html) and 270 | [`pesign`](https://github.com/rhboot/pesign) can be used to sign UKI files. The signature format and process 271 | again match the ones already used for PE files, so they will not be redefined here. 272 | 273 | ## Locations for Distribution-built UKIs Installed by Package Managers 274 | 275 | UKIs that are built centrally by distributions and installed via the package manager should be installed in 276 | `/usr/lib/modules/$UNAME/`, where `$UNAME` is the output of `uname -r` of the kernel included in the UKI, so 277 | that tools staging or consuming UKIs have a common place to store and look for them. 278 | 279 | The installed UKIs should have a filename `.efi`, i.e. the filename is left to 280 | implementers but must be valid for comparisons according to the [Version Format Specification](version_format_specification.md). 281 | 282 | ## Locations and Naming for UKI Auxiliary Resources 283 | 284 | Auxiliary UKI resources (such as PE addons for kernel command line extensions and similar, as well as 285 | systemd-sysext and systemd-confext DDIs) built centrally by distributions and installed via package manager 286 | should be installed into locations depending on whether they should be applied to all UKIs installed in the 287 | ESP, or only to a single specific UKI. 
288 | 289 | UKI auxiliary resources that apply to *all* installed UKIs should be 290 | installed into `/usr/lib/modules/uki.extra.d/`. UKI auxiliary resources that 291 | apply to *one* specific installed UKI should be instead installed into 292 | `/usr/lib/modules/$UNAME/$UKI.efi.extra.d/`, where `$UNAME` is the output of 293 | `uname -r` of the kernel included in the UKI and `$UKI` is the name of the 294 | corresponding centrally built UKI with the `.efi` extension stripped. 295 | 296 | The installed UKI auxiliary resources must have a specific file extension, which 297 | depends on the resource type: 298 | * `.addon.efi` for PE addons, 299 | * `.sysext.raw` for sysext DDIs, 300 | * `.confext.raw` for confext DDIs 301 | 302 | ### Example 303 | 304 | Given a UKI `bar_123.efi` that includes a kernel `6.9.1-1.foo`, consider 305 | * a PE addon `machine-id` that should apply to all installed UKIs, 306 | * a PE addon `proprietary-driver_2000` that is specific to the `bar_123` UKI, and 307 | * a sysext `mysysext_1.23.47^3` that should apply to all installed UKIs. 308 | 309 | The resulting paths would be 310 | * `/usr/lib/modules/uki.extra.d/machine-id.addon.efi`, 311 | * `/usr/lib/modules/6.9.1-1.foo/bar_123.efi.extra.d/proprietary-driver_2000.addon.efi`, and 312 | * `/usr/lib/modules/uki.extra.d/mysysext_1.23.47^3.sysext.raw`. 313 | -------------------------------------------------------------------------------- /specs/osc_context.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "UAPI.15 OSC 3008: Hierarchical Context Signalling" 3 | category: Concepts 4 | layout: default 5 | version: 1.0 6 | SPDX-License-Identifier: CC-BY-4.0 7 | weight: 15 8 | aliases: 9 | - /UAPI.15 10 | - /15 11 | --- 12 | 13 | # UAPI.15 OSC 3008: Hierarchical Context Signalling 14 | 15 | | Version | Changes | 16 | |---------|-----------------| 17 | | 1.0 | Initial release | 18 | 19 | A terminal connects a user with programs. 
Control of the program side of 20 | terminals is typically passed around to various different components while the 21 | user is active: a shell might pass control to a process it invokes. If that 22 | process is systemd's `run0` then primary control is passed to the privileged session of 23 | the target user. If `systemd-nspawn` is then invoked to start a container, primary 24 | control is passed to that container, and so on. 25 | 26 | A terminal emulator might be interested to know which component is currently in 27 | primary control of the program side of a terminal. OSC 3008 is a mechanism to 28 | inform it about such contexts. Each component taking over control can inform 29 | the terminal emulators that a new context begins now, and then use the terminal 30 | or pass control down to further apps, which can introduce contexts. Each 31 | context may carry various descriptive metadata fields. 32 | 33 | ## Use Cases 34 | 35 | Terminal emulators can use hierarchical context information: 36 | 37 | 1. To introduce markers/bookmarks in the output that the user can jump between. 38 | 39 | 2. To visually identify output from different contexts. For example the 40 | background of the associated output can be tinted in a reddish tone when 41 | privileges are acquired, and similar. 42 | 43 | 3. Meta information on specific output can be shown in a tooltip or similar. 44 | 45 | 4. Programs (and all subcontexts) can be killed via a right-click menu on the 46 | output they generate. 47 | 48 | 5. Similarly, a right-click menu might offer an item for opening a new 49 | interactive shell in the same working directory that was current on the 50 | selected context. 51 | 52 | 6. Failed commands or aborted sessions can be marked requesting user attention. 53 | 54 | 7. A "breadcrumb" style display can be implemented, showing the nesting of contexts to the user. 55 | 56 | ## Context Types 57 | 58 | There are various types of contexts defined by this specification: 59 | 60 | 1.
`boot` → a booted system initiates this context early at boot. (systemd's 61 | PID 1 generates this on `/dev/console`.) 62 | 63 | 2. `container` → a container manager initialized an interactive connection to a 64 | container. (`systemd-nspawn` generates this when interactively invoking a 65 | container. `machinectl login`, `machinectl shell` do this too.) 66 | 67 | 3. `vm` → a VM manager initialized a terminal connection to a 68 | VM. (`systemd-vmspawn` generates this when interactively invoking a VM, as 69 | one example.) 70 | 71 | 4. `elevate` → when the user interactively acquired higher privileges. (`run0` 72 | initiates a context of this type whenever the user invokes it to acquire 73 | root privileges.) 74 | 75 | 5. `chpriv` → similar, but when the user acquired *different* privileges, not 76 | necessarily higher ones. (`run0` initiates a context of this type whenever 77 | the user invokes it to acquire non-root privileges of another user.) 78 | 79 | 6. `subcontext` → similar, but the source and target privileges were 80 | identical. (`run0` initiates a context of this type whenever the user 81 | invokes it to acquire privileges of the user itself.) 82 | 83 | 7. `remote` → a user invoked a tool such as `ssh` to connect to a remote 84 | system. 85 | 86 | 8. `shell` → an interactive terminal shell initiates this context. 87 | 88 | 9. `command` → a shell interactively invokes a new program. 89 | 90 | 10. `app` → an interactive program may initiate this context. 91 | 92 | 11. `service` → the service manager invokes an interactive service on the terminal. 93 | 94 | 12. `session` → a login session of the user is initialized. 95 | 96 | ## Semantics 97 | 98 | Contexts in the sense of OSC 3008 are hierarchical, and describe a tree 99 | structure: whenever a new context is opened it becomes the new active context, 100 | and the previously active context becomes its parent (if there is one).
Only 101 | one context is currently active, but previously opened contexts remain valid in 102 | the background. Any other data written or read should be considered associated 103 | with the currently active context. 104 | 105 | Each context carries an identifier, chosen by the component opening the 106 | context. The identifier can be chosen freely, but must not be longer than 64 107 | characters. The characters may be in the 32…126 byte range. Identifiers should 108 | be universally unique, for example randomly generated. A freshly generated UUID 109 | would work well for this, but this could also be something like the Linux boot 110 | ID combined with the 64bit inode number of Linux pidfds, or something hashed 111 | from it. 112 | 113 | Fundamentally, there are two OSC 3008 commands defined: 114 | 115 | 1. OSC "`3008;start=`" … (the *start sequence*) → this initiates, updates or 116 | indicates a return to a context. It carries a context identifier, and 117 | typically some metadata. This may be sent to first initiate a context. If 118 | sent again for the same context ID that was initiated already this indicates 119 | an update of the existing context. In this case, *any* previously set 120 | metadata fields for the context are flushed out, reset to their defaults, 121 | and then reinitialized from the newly supplied data. Also, in this case any 122 | subcontexts of the context are implicitly terminated. 123 | 124 | 2. OSC "`3008;end=`" … (the *end sequence*) → this terminates a context. It 125 | carries a context identifier to close, initiated before with OSC 126 | "`3008;start=`". It may also carry additional metadata. 127 | 128 | ## General Syntax 129 | 130 | This builds on ECMA-48, and reuses the OSC and ST concepts introduced there. 131 | 132 | For sequences following this specification it is recommended to encode OSC as 133 | 0x1B 0x5D, and ST as 0x1B 0x5C. 134 | 135 | ECMA-48 only allows characters from the range 0x20…0x7e (i.e.
32…126) inside 136 | OSC sequences. However, most terminal emulators nowadays also allow bytes 137 | outside the ASCII range (> 0x7f) in the OSC sequences they process, and so does this 138 | specification. Control characters (< 0x20 and 0x7f) are not allowed. The 139 | semicolon character ("`;`") – which is used as field separator by this 140 | specification – shall be replaced by "`\x3b`" and the backslash character 141 | ("`\`") shall be replaced by "`\x5c`". All textual fields must be encoded in 142 | UTF-8, and then escaped with these two replacements. 143 | 144 | The start sequence begins with OSC, followed by the string `3008;start=`, 145 | followed by the context ID. This is then followed by any number of metadata 146 | fields, including none. Metadata fields begin with a semicolon (`;`) followed 147 | by a string identifying the type of field, followed by an equal sign (`=`), 148 | and the field value. The sequence ends in ST. 149 | 150 | The end sequence begins with OSC, followed by the string `3008;end=`, followed 151 | by the context ID, and a series of metadata fields in the same syntax as for 152 | the start sequence. The sequence ends in ST. 153 | 154 | ## Metadata Fields 155 | 156 | The following fields are currently defined for the start sequence: 157 | 158 | | Field | Context Types | Description | 159 | |---------------|-------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------| 160 | | `type=` | *all* | Declares the context type, one of the types described above | 161 | | `user=` | *all* | UNIX user name the process issuing the sequence runs as | 162 | | `hostname=` | *all* | UNIX host name of the system the process issuing the sequence runs on | 163 | | `machineid=` | *all* | The machine ID (i.e. `/etc/machine-id`) of the system the process issuing the sequence runs on | 164 | | `bootid=` | *all* | The boot ID (i.e.
`/proc/sys/kernel/random/boot_id`) of the system the process issuing the sequence runs on | 165 | | `pid=` | *all* | The numeric PID of the process issuing the sequence, in decimal notation | 166 | | `pidfdid=` | *all* | The 64bit inode number of the pidfd of the process issuing the sequence, in decimal notation | 167 | | `comm=` | *all* | The process name (i.e. `/proc/$PID/comm`, `PR_GET_NAME`) of the process issuing the sequence | 168 | | `cwd=` | `shell`, `command` | The current working directory | 169 | | `cmdline=` | `command` | The full command line of the invoked command | 170 | | `vm=` | `vm` | The name of the VM being invoked | 171 | | `container=` | `container` | The name of the container being invoked | 172 | | `targetuser=` | `elevate`, `chpriv`, `vm`, `container`, `remote`, `session` | Target UNIX user name | 173 | | `targethost=` | `remote` | Target UNIX, DNS host name, or IP address | 174 | | `sessionid=` | `session` | Newly allocated session ID | 175 | 176 | The following fields are currently defined for the end sequence: 177 | 178 | | Field | Context Types | Description | 179 | |-----------|---------------|------------------------------------------------------------------------------------------------------| 180 | | `exit=` | `command` | One of `success`, `failure`, `crash`, `interrupt`, indicating how the program terminated | 181 | | `status=` | `command` | The command's numeric exit status, i.e. the 0…255 value a program returns | 182 | | `signal=` | `command` | The termination signal of the command, if it died abnormally. A symbolic signal name. (`SIGKILL`, …) | 183 | 184 | All fields are optional, including the context type. However, it is generally 185 | recommended to always include the first 7 fields listed above for the start sequence (`type=` through `pidfdid=`), to make it easy 186 | to pinpoint the origin of a context in a race-free fashion, without any 187 | ambiguities.
188 | 189 | The order of the metadata fields is undefined, they may appear in any order 190 | (including that `type=` is specified at the very end or in the middle!). Note 191 | that `start=` and `end=` are not considered metadata fields but part of the 192 | start and end sequence prefixes respectively, and hence must always appear first, right after OSC and `3008;`. 193 | 194 | ## Processing, Limits, Security 195 | 196 | All context information provided like this should be considered auxiliary and – 197 | to some degree – redundant information. Hence, it would be wise for a terminal 198 | to enforce limits on various resources, dropping additional data once these 199 | limits are hit. Most importantly, a maximum stacking depth should probably be 200 | enforced: any attempts to initiate further contexts should be ignored once the 201 | stack limit is hit (i.e. the earlier contexts should be kept, the later 202 | contexts be discarded, not the opposite). Overly long fields should be 203 | discarded (or potentially truncated, depending on the field type). This 204 | specification does not recommend any specific stack or string limits for now. 205 | 206 | The usual terminal reset sequences should *not* affect the stack of contexts 207 | (this is a safety feature: a program down the stack should not be able to 208 | affect the stack further up, possibly hiding relevant information). A temporary 209 | TTY hangup (`vhangup()`) should result in a full reset of the stack. 210 | 211 | All provided data should be processed in a lenient, graceful fashion: if a 212 | sequence contains invalid fields, those fields should be ignored, but the rest 213 | of the fields should still be used. In particular, unknown fields should be 214 | ignored. 215 | 216 | The fields provided in these sequences should not contain sensitive 217 | information.
Context IDs should not be considered confidential, but it is 218 | strongly recommended to generate them in a fashion that guarantees their 219 | sufficient uniqueness and avoids accidental or intended clashes with other 220 | contexts. 221 | 222 | ## Examples 223 | 224 | 1. A new container `foobar` has been invoked by user `lennart` on host `zeta`: 225 | `OSC "3008;start=bed86fab93af4328bbed0a1224af6d40;type=container;user=lennart;hostname=zeta;machineid=3deb5353d3ba43d08201c136a47ead7b;bootid=d4a3d0fdf2e24fdea6d971ce73f4fbf2;pid=1062862;pidfdid=1063162;comm=systemd-nspawn;container=foobar" ST` 226 | 227 | 2. This context ends: `OSC "3008;end=bed86fab93af4328bbed0a1224af6d40" ST` 228 | 229 | ## Syntax in ABNF 230 | 231 | ```abnf 232 | OSC = %x1B %x5D 233 | ST = %x1B %x5C 234 | 235 | DECIMAL = "0"-"9" 236 | HEX = "0"-"9" / "A"-"F" / "a"-"f" 237 | ID128 = 32*36(HEX / "-") 238 | UINT64 = 1*20DECIMAL 239 | ESCSEMICOLON = "\x3b" 240 | ESCBACKSLASH = "\x5c" 241 | SAFE = %x20-3a / %x3c-5b / %x5d-7e / ESCSEMICOLON / ESCBACKSLASH 242 | 243 | CTXID = 1*64SAFE 244 | TYPEENUM = "service" / "session" / "shell" / "command" / "vm" / "container" / "elevate" / "chpriv" / "subcontext" / "remote" / "boot" / "app" 245 | 246 | TYPE = "type=" TYPEENUM 247 | USER = "user=" 1*255SAFE 248 | HOSTNAME = "hostname=" 1*255SAFE 249 | MACHINEID = "machineid=" ID128 250 | BOOTID = "bootid=" ID128 251 | PID = "pid=" UINT64 252 | PIDFDID = "pidfdid=" UINT64 253 | COMM = "comm=" 1*255SAFE 254 | CWD = "cwd=" 1*255SAFE 255 | CMDLINE = "cmdline=" *255SAFE 256 | VM = "vm=" 1*255SAFE 257 | CONTAINER = "container=" 1*255SAFE 258 | TARGETUSER = "targetuser=" 1*255SAFE 259 | TARGETHOST = "targethost=" 1*255SAFE 260 | SESSIONID = "sessionid=" 1*255SAFE 261 | 262 | STARTFIELD = TYPE / USER / HOSTNAME / MACHINEID / BOOTID / PID / PIDFDID / COMM / CWD / CMDLINE / VM / CONTAINER / TARGETUSER / TARGETHOST / SESSIONID 263 | STARTSEQ = OSC "3008;start=" CTXID *(";" STARTFIELD) ST 264 | 265 | EXITENUM = "success" /
"failure" / "crash" / "interrupt" 266 | SIGNALENUM = "SIGBUS" / "SIGTRAP" / "SIGABRT" / "SIGSEGV" / … 267 | 268 | EXIT = "exit=" EXITENUM 269 | STATUS = "status=" UINT64 270 | SIGNAL = "signal=" SIGNALENUM 271 | 272 | ENDFIELD = EXIT / STATUS / SIGNAL 273 | ENDSEQ = OSC "3008;end=" CTXID *(";" ENDFIELD) ST 274 | ``` 275 | 276 | 347 | -------------------------------------------------------------------------------- /specs/vmclock.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: UAPI.13 VMClock 3 | category: Concepts 4 | layout: default 5 | version: 1.0 6 | SPDX-License-Identifier: CC-BY-4.0 7 | weight: 13 8 | aliases: 9 | - /UAPI.13 10 | - /13 11 | --- 12 | 13 | # UAPI.13 VMClock: Efficient Time Synchronisation for Virtual Machines 14 | 15 | | Version | Changes | 16 | |---------|-----------------| 17 | | 1.0 | Initial Release | 18 | 19 | The requirements for accurate synchronisation of application clocks against 20 | real wallclock time are becoming ever more demanding. Increasingly cloud 21 | providers are exposing precision clock devices to virtual machines to allow the 22 | guest operating systems to synchronise their clocks. 23 | 24 | Time on modern systems is typically derived from a CPU-internal counter (*TSC, 25 | timebase, arch counter*) which runs at a nominally constant frequency of 26 | typically between 1GHz and 4GHz. In practice, the frequency of the underlying 27 | hardware counter will vary with environmental conditions, with a tolerance of 28 | the order of ±50PPM (parts per million). It is this variance which must constantly be corrected by 29 | synchronising against an external clock. 
30 | 31 | Synchronisation against an external clock typically works by reading the CPU 32 | counter, then reading the external clock, and finally reading the CPU counter 33 | again — then assuming that the external clock reading was concurrent with a 34 | point in time between the two CPU counter readings to give a pair of `{ CPU counter, real time }` 35 | values. Successive such readings are used to calibrate the 36 | precise rate at which the CPU counter is running, in order to use it for 37 | precision timekeeping. 38 | 39 | When applied at scale to virtual machines, there are a number of problems with 40 | this approach. Firstly, where virtual CPUs are overcommitted across a smaller 41 | number of physical CPUs in a host, guests experience "steal time" — time when 42 | their vCPU is not actually running. That steal time is unpredictable and can 43 | occur in the critical period between one read of the CPU counter and the next, 44 | affecting the precision of the estimated reading. 45 | 46 | A remedy for this issue is to repeat the reading a number of times, and to use 47 | the result where the latency between first and last CPU counter reading is the 48 | lowest. This exacerbates the second problem, that a large number of separate 49 | guest operating systems on the same host are now repeating the same work of 50 | calibrating the *same* underlying hardware oscillator. 51 | 52 | The third major problem of guest-calibrated time is Live Migration, in which a 53 | guest is transparently moved from one host to another for maintenance reasons. 54 | When this happens, the guest can experience a step change in both the frequency 55 | and the value of the CPU counter. The frequency because the migrated guest is 56 | now using a different underlying counter, and the value because correctly 57 | setting the counter value seen by the guest is dependent on the time 58 | synchronisation of each hypervisor host. 
After a Live Migration, a guest's 59 | clock should be considered inaccurate until it has been resynchronised from 60 | scratch. Failure to do so can lead to data corruption, in cases where database 61 | coherency depends on accurately timestamped transactions. 62 | 63 | ## The VMClock device 64 | 65 | The VMClock device resolves the above issues by allowing the hypervisor to 66 | synchronise the hardware clock against external time, and simply present the 67 | results to each guest in a shared memory region in the form of a formula for 68 | converting the CPU counter into real time. This allows guests to have precision 69 | timestamps even immediately after a Live Migration event, and with no need to 70 | provide further clock devices to the guest or for guests to spend their own CPU 71 | time on calibration. 72 | 73 | For guests which do perform their own additional refinement of the clock via 74 | NTP or other means, a disruption signal is provided which allows them to 75 | discard any such refinement after Live Migration, and start again with the data 76 | from the new hypervisor host. 77 | 78 | ## The vmclock_abi structure 79 | 80 | The hypervisor provides a structure in shared memory which is readable by the 81 | guest, and advertises it via either ACPI or device-tree devices as described 82 | below. Where possible, these fields and their values are aligned with the 83 | definitions in the [virtio-rtc](https://virtio-rtc) standard. As with virtio, 84 | all fields are stored in little-endian form. 85 | 86 | The fields up to and including `time_type` are constant and shall not change 87 | during the lifetime of the device. The subsequent fields may be updated 88 | dynamically, using `seq_count` as a synchronisation mechanism as follows: 89 | 90 | 1. Increase `seq_count` to an odd value. 91 | 2. Update the remaining fields in the structure. 92 | 3. Increase `seq_count` again to an even value. 93 | 4. 
If `VMCLOCK_FLAG_NOTIFICATION_PRESENT` is set in the `flags` field, raise an interrupt or ACPI notification. 94 | 95 | If memory barriers are necessary to ensure that changes to the memory are 96 | visible to the guest, they should be present at each stage. The total amount of 97 | time during which `seq_count` remains at an odd value shall be short enough 98 | that it is reasonable for a guest to *spin* while waiting for the update to 99 | complete, as described below. 100 | 101 | ### Structure Fields 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 122 | 123 | 124 | 125 | 126 | 130 | 131 | 132 | 133 | 134 | 146 | 147 | 148 | 149 | 150 | 166 | 167 | 168 | 169 | 170 | 179 | 180 | 181 | 182 | 183 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 214 | 215 | 216 | 217 | 218 | 220 | 221 | 222 | 223 | 224 | 226 | 227 | 228 | 229 | 230 | 232 | 233 | 234 | 235 | 236 | 238 | 239 | 240 | 241 | 242 | 244 | 245 | 246 | 247 | 248 | 250 | 251 | 252 | 253 | 254 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 267 | 268 | 269 | 270 | 271 | 273 | 274 | 275 | 276 | 277 | 279 | 280 | 281 | 282 | 283 | 288 | 289 | 290 |
OffsetFieldDescription
0x00uint32_t magicMagic value 0x4b4c4356 (“VCLK”)
0x04uint32_t sizeSize of region containing this structure (typically a full page at 121 | the granularity at which the hypervisor maps memory to the guest)
0x08uint16_t versionThis standard defines version 1. Since the flags field 127 | allows for extensions to the data structure without breaking backward 128 | compatibility, it is not anticipated that the version field 129 | will ever need to change.
0x0auint8_t counter_idThe hardware counter used as the basis for clock readings. The 135 | values of this field correspond to the 136 | VIRTIO_RTC_COUNTER_xxx values: 137 |
    138 |
  • 0x00: VMCLOCK_COUNTER_ARM_VCNT: The Arm 139 | architectural timer (virtual)
  • 140 |
  • 0x01: VMCLOCK_COUNTER_X86_TSC: The x86 141 | Time Stamp Counter
  • 142 |
  • 0xFF: VMCLOCK_COUNTER_INVALID: No 143 | precision clock is advertised
  • 144 |
145 |
0x0buint8_t time_typeIndicates the type of clock exposed through this interface. The 151 | values of this field correspond to the VIRTIO_RTC_CLOCK_xxx 152 | values, except that smearing of clocks is not supported as it is 153 | antithetical to precision: 154 |
    155 |
  • 0x00: VMCLOCK_TIME_UTC (Not 156 | recommended)
  • 157 |
  • 0x01: VMCLOCK_TIME_TAI
  • 158 |
  • 0x02: VMCLOCK_MONOTONIC
  • 159 |
160 | For UTC and TAI, the calculation results in a number of seconds 161 | since midnight on 1970-01-01. A monotonic clock has no defined epoch. 162 | Since UTC has leap seconds and a given numbered second may occur more 163 | than once, its use is NOT RECOMMENDED in VMClock. 164 | Implementations should advertise TAI, with a correct UTC offset. 165 |
0x0cuint32_t seq_countThis field is used to provide a sequence-based read/write lock for 171 | the non-constant fields which follow. To perform an update, the device 172 | will: 173 |
    174 |
  • Increment this field to an odd value (with the low bit set)
  • 175 |
  • Change other fields as appropriate.
  • 176 |
  • Increment this field again to an even value.
  • 177 |
178 |
0x10uint64_t disruption_markerThis field is changed each time there may be a disruption to the 184 | hardware counter referenced by counter_id, for example 185 | through live migration to a new hypervisor host.
0x18uint64_t flagsFeature flags (see below)
0x20uint16_t padUnused
0x22uint8_t clock_statusSynchronisation status of the clock (see below)
0x23uint8_t leap_second_smearing_hintSmearing hint for guest OS (see below)
0x24int16_t tai_offset_secSigned offset from TAI to UTC at the reference time specified in 211 | time_sec and time_frac_sec, in seconds. Valid 212 | if the corresponding bit in the flags field is set. Implementations 213 | SHOULD populate this field; the value at time of writing is 37.
0x26uint8_t leap_indicatorIndicates the presence and direction of a leap second occurring in 219 | the near future or recent past (see below)
0x27uint8_t counter_period_shiftAdditional shift applied to all the 225 | counter_period*_frac_sec fixed-point fields.
0x28uint64_t counter_valueValue of the hardware counter at the time represented by 231 | time_sec + time_frac_sec.
0x30uint64_t counter_period_frac_secPeriod of a single counter tick, in units of 1 / 2^(64 + 237 | counter_period_shift) seconds
0x38uint64_t counter_period_esterror_rate_frac_secEstimated ± error of counter_period_frac_sec in the 243 | same units.
0x40uint64_t counter_period_maxerror_rate_frac_secMaximum ± error of counter_period_frac_sec in the same 249 | units.
0x48uint64_t time_secReference time point, seconds since epoch defined by 255 | time_type field.
0x50uint64_t time_frac_secFractional part of reference time, in units of second / 2⁶⁴.
0x58uint64_t time_esterror_nanosecEstimated ± error of the time given in time_sec + 266 | time_frac_sec, in nanoseconds
0x60uint64_t time_maxerror_nanosecMaximum ± error of the time given in time_sec + 272 | time_frac_sec, in nanoseconds
0x68uint64_t vm_generation_countA change in this field indicates that the guest has been cloned or 278 | loaded from a snapshot (see below).
0x70The size of the memory region containing this structure is given in 284 | the size field, which will typically be a full 4KiB page. 285 | New fields may be added here, advertised by newly-defined bits in the 286 | flags field, without changing the version 287 | field.
291 | 292 | ### Feature Flags (0x18) 293 | 294 | | Bit | Flag | Description | 295 | |-----|---------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------| 296 | | 0 | `VMCLOCK_FLAG_TAI_OFFSET_VALID` | Indicates that the `tai_offset_sec` field contains a correct value. All implementations SHOULD set this bit. | 297 | | 1 | `VMCLOCK_FLAG_DISRUPTION_SOON` | Indicates that a clock disruption event (e.g. live migration) is expected to happen in the next day or so. | 298 | | 2 | `VMCLOCK_FLAG_DISRUPTION_IMMINENT` | Indicates that a clock disruption event is expected to happen within the next hour or so. | 299 | | 3 | `VMCLOCK_FLAG_PERIOD_ESTERROR_VALID` | Indicates that `counter_period_esterror_rate_frac_sec` contains valid data. | 300 | | 4 | `VMCLOCK_FLAG_PERIOD_MAXERROR_VALID` | Indicates that `counter_period_maxerror_rate_frac_sec` contains valid data. | 301 | | 5 | `VMCLOCK_FLAG_TIME_ESTERROR_VALID` | Indicates that `time_esterror_nanosec` contains valid data. | 302 | | 6 | `VMCLOCK_FLAG_TIME_MAXERROR_VALID` | Indicates that `time_maxerror_nanosec` contains valid data. | 303 | | 7 | `VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT` | Indicates that the `vm_generation_count` field is present. | 304 | | 8 | `VMCLOCK_FLAG_NOTIFICATION_PRESENT` | Indicates that the VMClock device will send an interrupt or ACPI notification every time it updates `seq_count` to a new even value. | 305 | 306 | Unknown flags set by the device can safely be ignored. If a change in behaviour 307 | is required by a future version of this specification, it would come with a new 308 | value of the `version` field or a new `time_type` to avoid breaking 309 | compatibility with existing users.
310 | 311 | ### Clock Status (0x22) 312 | 313 | | Value | Status | Description | 314 | |-------|-------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 315 | | 0x00 | `VMCLOCK_STATUS_UNKNOWN` | The clock is in an indeterminate state. Clock parameters in the VMClock structure are not valid and should not be relied upon. | 316 | | 0x01 | `VMCLOCK_STATUS_INITIALIZING` | The clock is being initialized and is not yet synchronized. Clock parameters in the VMClock structure are not valid and should not be relied upon. | 317 | | 0x02 | `VMCLOCK_STATUS_SYNCHRONIZED` | The clock is synchronized. Clock parameters in the VMClock structure are expected to be correct and may be relied upon. | 318 | | 0x03 | `VMCLOCK_STATUS_FREERUNNING` | The clock has transitioned away from being synchronized and is in a free-running state. Clock parameters in the VMClock structure are expected to be valid and may be relied upon. | 319 | | 0x04 | `VMCLOCK_STATUS_UNRELIABLE` | The clock is considered broken. Clock parameters in the VMClock structure should not be relied upon. | 320 | 321 | ### Leap Second Smearing Hint (0x23) 322 | 323 | The time exposed through the VMClock device shall never be smeared. This field 324 | corresponds to the `subtype` field in virtio-rtc, which indicates a smearing 325 | method. In this case it merely provides a hint to the guest operating system, 326 | such that if the guest OS wants to provide its users with an alternative clock 327 | which does not follow UTC, it may do so in a fashion consistent with the other 328 | systems in the nearby environment.
329 | 330 | | Value | Hint | 331 | |-------|--------------------------------| 332 | | 0x00 | `VMCLOCK_SMEARING_STRICT` | 333 | | 0x01 | `VMCLOCK_SMEARING_NOON_LINEAR` | 334 | | 0x02 | `VMCLOCK_SMEARING_UTC_SLS` | 335 | 336 | ### Leap Indicator (0x26) 337 | 338 | The value of this field shall be valid for the point in time referenced by the 339 | `time_sec` and `time_frac_sec` fields. 340 | 341 | | Value | Indicator | Description | 342 | |-------|-------------------------|--------------------------------------------------------------------------------| 343 | | 0x00 | `VMCLOCK_LEAP_NONE` | No known nearby leap second | 344 | | 0x01 | `VMCLOCK_LEAP_PRE_POS` | A positive leap second will occur at the end of the present month | 345 | | 0x02 | `VMCLOCK_LEAP_PRE_NEG` | A negative leap second will occur at the end of the present month | 346 | | 0x03 | `VMCLOCK_LEAP_POS` | A positive leap second is currently occurring (set during the 23:59:60 second) | 347 | | 0x04 | `VMCLOCK_LEAP_POST_POS` | A positive leap second occurred at the end of the previous month | 348 | | 0x05 | `VMCLOCK_LEAP_POST_NEG` | A negative leap second occurred at the end of the previous month | 349 | 350 | ### VM Generation Count (0x68) 351 | 352 | A change in this field indicates that the guest has been cloned or loaded from a snapshot. The operating system may wish to regenerate unique identifiers, reset network connections or reseed entropy, etc. 353 | 354 | The conditions under which this counter changes are identical to those of the [VMGenID device](vmgenid.md).
The `vm_generation_count` changes whenever the VM is restored to an earlier or non-unique state: 355 | 356 | - Snapshot restoration 357 | - Backup recovery 358 | - VM cloning/copying/import 359 | - Disaster recovery failover 360 | 361 | The `vm_generation_count` remains constant during normal VM operations: 362 | 363 | - Pause/resume 364 | - Shutdown/restart/reboot 365 | - Host reboot or upgrade 366 | - Live migration or lossless online failover 367 | 368 | The `disruption_marker` and `vm_generation_count` fields indicate two orthogonal, but sometimes correlated, types of event. It is generally likely that the `disruption_marker` would also be changed when the `vm_generation_count` changes, but not necessarily vice versa. 369 | 370 | It is possible that a VM could be cloned (forked) while running on the same host, such that the precision of the hardware counter is not lost, but the uniqueness is. That would be the rare case where the `vm_generation_count` would be changed but not the `disruption_marker`. 371 | 372 | ## Calculating time 373 | 374 | The VMClock structure provides the following values: 375 | 376 | - Reference time T₁ in the `time_sec` and `time_frac_sec` fields 377 | - Counter value C₁ of the hardware counter at time T₁ in the `counter_value` field. 378 | - The period P of a single counter tick is given by `counter_period_frac_sec` >> `counter_period_shift`. 379 | 380 | For example, a 1GHz clock would have a period of 1ns, which could naïvely be 381 | represented as `0x44B82FA0A / 2⁶⁴` by putting that value in 382 | `counter_period_frac_sec`. Over long periods of time, however, the loss of 383 | precision would be noticeable. So the same 1ns period should be more precisely 384 | represented as `0x89705F4136B4A597 / 2^(64+29)` by using that value in 385 | `counter_period_frac_sec` and setting `counter_period_shift` to 29. 
386 | 387 | To calculate the time, the guest shall first read the `seq_count` field and 388 | wait until it returns an even value, then read the hardware counter C_now and 389 | calculate the time accordingly as **T₁ + P(C_now - C₁)**. Finally, read the 390 | `seq_count` field again. If the value of the `seq_count` field has changed, 391 | discard the result and repeat the procedure from the beginning. 392 | 393 | Where UTC is involved, a correct implementation will need to cope with the case 394 | where a leap second has occurred since the reference time T₁, and the result 395 | needs to be adjusted accordingly. The `leap_indicator` field exists to resolve 396 | the technical ambiguity but using TAI is simpler and less error prone. It is 397 | strongly recommended that implementations use TAI as the time standard and 398 | advertise a correct TAI offset, to avoid this complexity. 399 | 400 | ## Time error calculation 401 | 402 | The VMClock structure optionally advertises maximum error bounds for the clock 403 | data it provides, in the form of deltas to the T₁ and P values used above. The 404 | true time is guaranteed to be within: 405 | 406 | **(T₁ ± T_maxerr) + (P ± P_maxerr)(C_now - C₁)** 407 | 408 | where T_maxerr and P_maxerr are the `time_maxerror_nanosec` and `counter_period_maxerror_rate_frac_sec` fields, respectively. 409 | 410 | The device may update the time calibration fields at any time, by incrementing 411 | the `seq_count` to an odd value, adjusting the parameters, then incrementing 412 | `seq_count` again to an even value. For any given historical counter reading 413 | and the error bounds calculated according to VMClock at that moment, it is 414 | guaranteed that any *subsequent* update to the VMClock fields shall also result 415 | in a calculation for that same counter value which falls between the earliest 416 | and latest times that were previously indicated. 
417 | 418 | ## Discovery via ACPI 419 | 420 | To expose VMClock to the operating system via ACPI, the firmware or hypervisor must: 421 | 422 | 1. Place the shared `vmclock_abi` structure somewhere in RAM, ROM or device memory space, which is guaranteed not to be used by the operating system. It must not be in ranges reported as `AddressRangeMemory` or `AddressRangeACPI`, and must not be in the same page as any memory which is expected to be mapped by a page table entry with caching disabled. 423 | 424 | 2. Expose a device somewhere in the ACPI namespace with: 425 | - a hardware ID (`_HID`) of "AMZNC10C" 426 | - a DOS Device Name ID (`_DDN`) of "VMCLOCK" 427 | - a compatible ID (`_CID`) of "VMCLOCK" 428 | 429 | 3. Attach to the device a "`_CRS`" method which when evaluated describes the shared memory page where the hypervisor has stored the `vmclock_abi` structure. 430 | 431 | 4. Optionally, the device can raise an ACPI Notify operation using notification code 0x80, every time the `seq_count` field changes to a new even number. If implemented, the hypervisor must advertise the notification feature to the driver by setting the `VMCLOCK_FLAG_NOTIFICATION_PRESENT` bit in the `flags` field. 432 | 433 | ## Discovery via Device Tree 434 | 435 | Similar to the ACPI binding above, the firmware or hypervisor must place the 436 | `vmclock_abi` structure in an otherwise unused region of physical memory and 437 | advertise its presence to the operating system. 
The Device Tree binding for the 438 | `amazon,vmclock` node is as follows: 439 | 440 | ```yaml 441 | # SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) 442 | %YAML 1.2 443 | --- 444 | $id: http://devicetree.org/schemas/clock/amazon,vmclock.yaml# 445 | $schema: http://devicetree.org/meta-schemas/core.yaml# 446 | 447 | title: Virtual Machine Clock 448 | 449 | maintainers: 450 | - David Woodhouse <dwmw2@infradead.org> 451 | 452 | description: 453 | The vmclock device provides a precise clock source and allows for 454 | accurate timekeeping across live migration and snapshot/restore 455 | operations. The full specification of the shared data structure 456 | is available at https://david.woodhou.se/VMClock.pdf 457 | 458 | properties: 459 | compatible: 460 | const: amazon,vmclock 461 | 462 | reg: 463 | description: 464 | Specifies the shared memory region containing the vmclock_abi structure. 465 | maxItems: 1 466 | 467 | interrupts: 468 | description: 469 | Interrupt used to notify when the contents of the vmclock_abi structure 470 | have been updated. 471 | maxItems: 1 472 | 473 | required: 474 | - compatible 475 | - reg 476 | 477 | additionalProperties: false 478 | 479 | examples: 480 | - | 481 | #include <dt-bindings/interrupt-controller/arm-gic.h> 482 | ptp@80000000 { 483 | compatible = "amazon,vmclock"; 484 | reg = <0x80000000 0x1000>; 485 | interrupts = <GIC_SPI 36 IRQ_TYPE_EDGE_RISING>; 486 | }; 487 | ``` 488 | 489 | ## Hardware implementation 490 | 491 | It is possible for a hardware implementation of VMClock to exist, in the 492 | absence of a hypervisor or virtualization. Using mechanisms such as PCIe PTM, a 493 | device could synchronise the CPU's counter directly against real time and 494 | advertise the result to the operating system. 495 | 496 | Such an implementation is outside the scope of this specification for now, but 497 | only just. We may need to add a new option for the `counter_id` field which 498 | references the hardware clock available to the PCIe device for PTM 499 | synchronisation, for example the Intel Always Running Timer. 
500 | -------------------------------------------------------------------------------- /specs/linux_file_system_hierarchy.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: UAPI.9 Linux File System Hierarchy 3 | category: Concepts 4 | layout: default 5 | version: 0.1 6 | SPDX-License-Identifier: CC-BY-4.0 7 | weight: 9 8 | aliases: 9 | - /UAPI.9 10 | - /9 11 | --- 12 | 13 | # UAPI.9 Linux File System Hierarchy 14 | 15 | | Version | Changes | 16 | |---------|------------------| 17 | | 0.1 | Work in progress | 18 | 19 | ## Description 20 | 21 | This page describes the layout of a modern Linux system. 22 | This hierarchy is an evolution of the historical UNIX layout, 23 | and includes concepts described in the 24 | [File System Hierarchy](http://refspecs.linuxfoundation.org/FHS_3.0/fhs-3.0.html) 25 | specification and 26 | [`hier(7)`](https://man7.org/linux/man-pages/man7/hier.7.html) man page, 27 | and various extensions documented in the 28 | [XDG Base Directory Specification](https://specifications.freedesktop.org/basedir/latest/) 29 | and [XDG User Directories](https://www.freedesktop.org/wiki/Software/xdg-user-dirs). 30 | 31 | In some areas this document is stricter than those older documents. 32 | In particular it makes additional restrictions and recommendations 33 | to separate vendor defaults and local configuration, and also 34 | static installed files, persistent data, and ephemeral runtime files. 35 | In other areas it is simpler, 36 | with the vendor files using a single hierarchy under `/usr`. 37 | 38 | This document doesn't define the directory structure comprehensively, 39 | it only documents a skeleton of a directory tree, 40 | to be extended by additional directories lower in the hierarchy. 41 | Some directories like `/var/spool/` are not covered, 42 | even though it might make sense to include them 43 | in the structure of an actually deployed OS. 
44 | 45 | Some directories are described for compatibility with current Linux distributions, 46 | but their use is not recommended. 47 | The subsections that describe those directories are marked with ⚠️. 48 | 49 | Many of the paths described here can be queried with the 50 | [`systemd-path(1)`](https://www.freedesktop.org/software/systemd/man/systemd-path.html) 51 | tool, on systems where this tool is available. 52 | 53 | ## Character Sets 54 | 55 | All paths and filenames currently defined by this specification use the 7bit ASCII (ANSI_X3.4-1968) character 56 | set, and it is expected that future additions will continue to do so. 57 | 58 | Names of files or directories placed within the directories defined should generally be in UTF-8 ([IETF RFC 59 | 3629](https://www.rfc-editor.org/rfc/rfc3629.html)) encoding. Applications should generally assume that the 60 | names of any files or directories placed in any of these directories (or below) are in UTF-8, unless 61 | explicitly configured otherwise. 62 | 63 | ## General Structure 64 | 65 | ### `/` 66 | 67 | The file system root. 68 | Usually writable, but this is not required. 69 | Possibly a temporary file system (`tmpfs`). 70 | Not shared with other hosts (unless read-only). 71 | 72 | ### `/boot/` 73 | 74 | The partition where kernels and other assets used to bring up the system are stored. 75 | On EFI systems, this is possibly the EFI System Partition (ESP), 76 | also see 77 | [`systemd-gpt-auto-generator(8)`](https://www.freedesktop.org/software/systemd/man/systemd-gpt-auto-generator.html). 78 | This directory is usually strictly local to the host, 79 | and should be considered read-only, 80 | except when a kernel or boot loader is installed or updated. 81 | This directory is only populated on systems that run on 82 | physical or emulated hardware that requires a boot loader. 
83 | 84 | ### `/efi/` 85 | 86 | If the EFI System Partition (ESP) is maintained separately 87 | from the boot partition described in the previous section, 88 | it is mounted here. 89 | Tools that need to operate on the ESP should look for it at this mount point first, 90 | and fall back to `/boot/` if the first location does not qualify 91 | (for example if it is not a mount point 92 | or does not have the correct file system type `MSDOS_SUPER_MAGIC`). 93 | 94 | ### `/etc/` 95 | 96 | System-specific configuration. 97 | This directory may be read-only. 98 | Frequently, this directory is pre-populated with vendor-supplied configuration files, 99 | but applications should not make assumptions 100 | about this directory being fully populated or populated at all, 101 | and should fall back to defaults if configuration is missing, 102 | following the 103 | [Configuration Files Specification](configuration_files_specification.md). 104 | 105 | If `/opt/` is supported (see below), 106 | then the `/etc/opt/` ⚠️ subdirectory is the location where 107 | third-party software installed in `/opt/` may store its configuration. 108 | The same naming convention as directories under `/opt/` is used for directories under `/etc/opt/`. 109 | 110 | ### `/home/` 111 | 112 | The location for normal users' home directories. 113 | Possibly shared with other systems, and never read-only. 114 | This directory should only be used for normal users, never for system users. 115 | This directory and possibly the directories contained within it 116 | might only become available or writable in late boot or even only after user authentication. 117 | This directory might be placed on limited-functionality network file systems, 118 | hence applications should not assume the full set of file APIs is available on this directory. 
119 | Applications should generally not reference this directory directly, 120 | but via the per-user `$HOME` environment variable, 121 | or via the home directory field of the user database. 122 | 123 | ### `/opt/` ⚠️ 124 | 125 | A secondary location for third-party vendor directories. 126 | This directory is optional, 127 | as not all systems allow installing third-party software. 128 | 129 | Each third-party vendor (i.e. unrelated to the OS provider) may use a subdirectory, 130 | typically named after the vendor or the software, under this location. 131 | It is usually read-only, but this is not required. 132 | This directory should not be modified by the administrator, 133 | except when installing or removing third-party-supplied software. 134 | 135 | Using `/opt/` is not recommended. 136 | It is not integrated with the rest of the distribution: 137 | a package which uses `/opt/` may need to install 138 | binaries or links in `/usr/bin/` 139 | and other supplementary files, 140 | e.g. desktop files or manual pages, 141 | into their appropriate locations under `/usr`. 142 | Instead of using a subdirectory under `/opt/`, 143 | a third-party vendor should put their directory under `/usr/lib/`. 144 | 145 | ### `/root/` 146 | 147 | The home directory of the root user. 148 | The root user's home directory is located outside of `/home/` 149 | in order to make sure the root user may log in 150 | even without `/home/` being available and mounted. 151 | 152 | ### `/srv/` 153 | 154 | The place to store general server payload, managed by the administrator. 155 | No restrictions are made on how this directory is organized internally. 156 | Generally writable, and possibly shared among systems. 157 | This directory might become available or writable only very late during boot. 158 | 159 | ### `/tmp/` 160 | 161 | The place for small temporary files. 162 | This directory is usually mounted as a `tmpfs` instance, 163 | and should hence not be used for larger files. 
164 | (Use `/var/tmp/` for larger files.) 165 | This directory is usually flushed at boot-up. 166 | Also, files that are not accessed within a certain time may be automatically deleted. 167 | 168 | If applications find the environment variable `$TMPDIR` set, 169 | they should use the directory specified in it instead of `/tmp/` 170 | (see 171 | [`environ(7)`](https://man7.org/linux/man-pages/man7/environ.7.html) and 172 | [IEEE Std 1003.1](http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap08.html#tag_08_03) 173 | for details). 174 | 175 | Since `/tmp/` is accessible to other users of the system, 176 | it is essential that files and subdirectories under this directory are only created with 177 | [`mkstemp(3)`](https://man7.org/linux/man-pages/man3/mkstemp.3.html), 178 | [`mkdtemp(3)`](https://man7.org/linux/man-pages/man3/mkdtemp.3.html), 179 | and similar calls. 180 | For more details, see 181 | [Using /tmp/ and /var/tmp/ Safely](https://systemd.io/TEMPORARY_DIRECTORIES). 182 | 183 | ## Runtime Data 184 | 185 | ### `/run/` 186 | 187 | A `tmpfs` file system for system packages to place 188 | runtime data, socket files, and similar. 189 | This directory is flushed on boot, 190 | and generally writable for privileged programs only. 191 | Always writable. 192 | 193 | ### `/run/log/` 194 | 195 | Runtime system logs. 196 | System components may place private logs in this directory. 197 | Always writable, even when `/var/log/` might not be accessible yet. 198 | 199 | ### `/run/user/` 200 | 201 | Contains per-user runtime directories, 202 | each usually individually mounted `tmpfs` instances. 203 | Always writable, flushed at each reboot and when the user logs out. 204 | User code should not reference this directory directly, 205 | but via the `$XDG_RUNTIME_DIR` environment variable, 206 | as documented in the 207 | [XDG Base Directory Specification](https://specifications.freedesktop.org/basedir/latest/). 
208 | 209 | ## Vendor-supplied Operating System Resources 210 | 211 | ### `/usr/` 212 | 213 | Vendor-supplied operating system resources. 214 | Usually read-only, but this is not required. 215 | Possibly shared between multiple hosts. 216 | This directory should not be modified by the administrator, 217 | except when installing or removing vendor-supplied packages. 218 | 219 | ### `/usr/bin/` 220 | 221 | Binaries and executables for user commands that shall appear in the `$PATH` search path. 222 | It is recommended not to place binaries in this directory 223 | that are not useful for invocation from a shell (such as daemon binaries); 224 | these should be placed in a subdirectory of `/usr/lib/` instead. 225 | 226 | ### `/usr/include/` 227 | 228 | Header files for programmatic APIs, typically used when compiling `C/C++` programs. 229 | Packages may place such files either directly in `/usr/include/`, 230 | or in a subdirectory. 231 | Using a subdirectory is recommended if the package installs multiple files. 232 | It is also necessary if multiple versions of the same headers shall be coinstallable. 233 | The subdirectory may be named after the package or project providing it. 234 | Packages may place architecture-dependent header files and directories in a 235 | `/usr/include/<arch-id>/` subdirectory, 236 | following the identifiers defined on the 237 | [Multiarch Architecture Specifiers (Tuples)](https://wiki.debian.org/Multiarch/Tuples) 238 | list. 239 | 240 | Note that to use the headers, flags for the compiler may be required 241 | to bring the appropriate subdirectory into the search path. 242 | See man pages 243 | [`pc(5)`](https://www.mankier.com/5/pc), 244 | [`pkgconf(1)`](https://www.mankier.com/1/pkgconf), 245 | [gcc's Search Path](https://gcc.gnu.org/onlinedocs/cpp/Search-Path.html), 246 | and other compiler-specific documentation. 247 | 248 | The corresponding library or libraries shall be placed in `$libdir`, see below. 
249 | 250 | ### Libraries, system programs, and program assets 251 | 252 | This section describes the directories used to store shared libraries, 253 | internal binaries or other programs that are not regularly invoked from a shell 254 | (and thus should not be placed in the directories in `$PATH`), 255 | and other static files that are part of program installations. 256 | 257 | There are two main schemes for shared libraries. 258 | On "multiarch" systems, 259 | multiple different architecture and ABI variants can be installed in parallel. 260 | Each variant uses a hierarchy of files under a directory named using the 261 | [Multiarch Architecture Specifiers (Tuples)](https://wiki.debian.org/Multiarch/Tuples) 262 | list. 263 | On "multilib" systems, 264 | a simpler scheme is used that only supports 32-bit and 64-bit variants of the same architecture. 265 | 266 | Both schemes are widely used, and distributions typically choose one or the other. 267 | Multiarch is the recommended approach, especially for new systems. 268 | 269 | Note that many locations described in this section 270 | are under shared ownership, 271 | with multiple different packages installing and consuming resources 272 | on equal footing without any obvious primary owner, 273 | and are subject to specifications that ensure interoperability. 274 | 275 | #### `/usr/lib/` 276 | 277 | Static, private vendor data that is compatible with all architectures 278 | (though not necessarily architecture-independent). 279 | Note that this includes 280 | internal binaries or other programs that are not regularly invoked from a shell. 281 | Such binaries may be for any architecture supported by the system. 282 | 283 | In the multilib scheme, 284 | 32-bit libraries are placed directly in this directory. 285 | In the multiarch scheme, 286 | libraries should not be placed directly in this directory, 287 | but in `$libdir` (see below), instead. 
288 | 289 | #### `/usr/lib/<arch-id>/` 290 | 291 | Location for dynamic libraries, also called `$libdir`. 292 | The architecture identifier to use is defined on the 293 | [Multiarch Architecture Specifiers (Tuples)](https://wiki.debian.org/Multiarch/Tuples) 294 | list. 295 | Those directories are used on multiarch systems. 296 | 297 | On multilib systems, 298 | `/usr/lib/` and `/usr/lib64/` are used instead, 299 | and one of them is `$libdir`. 300 | 301 | This directory can be used for architecture-dependent package-specific data too. 302 | 303 | The primary architecture of the system (`$libdir`) may be queried with: 304 | 305 | systemd-path system-library-arch 306 | 307 | #### `/usr/libexec/` ⚠️ 308 | 309 | A secondary location for 310 | vendor binaries or other programs that are not regularly invoked from a shell 311 | that is used by some distributions. 312 | Packages may either place such programs 313 | in a subdirectory of `/usr/lib/`, 314 | directly in `/usr/libexec/`, 315 | or in a subdirectory of `/usr/libexec/` named after the package. 316 | The first option is the recommended approach. 317 | `/usr/libexec/` is used by some distributions, 318 | so it is mentioned here too, 319 | but its use is not encouraged. 320 | 321 | Binaries in `/usr/libexec/` may be for any architecture supported by the system. 322 | 323 | ### `/usr/share/` 324 | 325 | Architecture-independent resources of packages, 326 | such as documentation, man pages, time zone information, and fonts. 327 | 328 | Those files are often shared between multiple packages, 329 | so the precise location and format of files stored below this directory 330 | are subject to specifications that ensure interoperability. 331 | 332 | ### `/usr/share/doc/` 333 | 334 | Documentation for the operating system or system packages. 335 | 336 | ### `/usr/share/factory/etc/` 337 | 338 | Repository for vendor-supplied default configuration files. 
339 | This directory should be populated with pristine vendor versions 340 | of all configuration files that may be placed in `/etc/`. 341 | This is useful to compare the local configuration of a system with vendor defaults 342 | and to populate the local configuration with defaults. 343 | 344 | Software should not read configuration settings directly from `/usr/share/factory/`. 345 | Those files will be copied to other locations if appropriate, 346 | and should only be read from there. 347 | 348 | ### `/usr/share/factory/var/` 349 | 350 | Similar to `/usr/share/factory/etc/`, 351 | but for vendor versions of files in the variable, persistent data directory `/var/`. 352 | The same recommendations as for `/usr/share/factory/etc/` apply here. 353 | 354 | ## Persistent Variable System Data 355 | 356 | ### `/var/` 357 | 358 | Persistent, variable system data. 359 | Writable during normal system operation. 360 | This directory might be pre-populated with vendor-supplied data, 361 | but applications should be able to reconstruct 362 | necessary files and directories in this subhierarchy should they be missing, 363 | as the system might start up without this directory being populated. 364 | Persistency is recommended, but optional, to support ephemeral systems. 365 | This directory might become available or writable only very late during boot. 366 | Components that are required to operate during early boot 367 | hence shall not unconditionally rely on this directory. 368 | 369 | ### `/var/cache/` 370 | 371 | Persistent system cache data. 372 | System components may place non-essential data in this directory. 373 | Flushing this directory should have no effect on operation of programs, 374 | except for increased runtimes necessary to rebuild these caches. 375 | 376 | ### `/var/lib/` 377 | 378 | Persistent system data. 379 | System components may place private data in this directory. 380 | 381 | ### `/var/log/` 382 | 383 | Persistent system logs. 
384 | System components may place private logs in this directory, 385 | though it is recommended to do most logging via the 386 | [`syslog(3)`](https://man7.org/linux/man-pages/man3/syslog.3.html) and 387 | [`sd_journal_print(3)`](https://www.freedesktop.org/software/systemd/man/sd_journal_print.html) 388 | calls. 389 | 390 | ### `/var/opt/` ⚠️ 391 | 392 | If `/opt/` is supported (see above), 393 | then the `/var/opt/` subdirectory is the location where third-party software installed in `/opt/` 394 | stores its persistent, variable data. 395 | The same naming convention as directories under `/opt/` is used for directories under `/var/opt/`. 396 | 397 | ### `/var/tmp/` 398 | 399 | The place for larger and persistent temporary files. 400 | In contrast to `/tmp/`, 401 | this directory is usually mounted from a persistent physical file system 402 | and can thus accept larger files. 403 | (Use `/tmp/` for small ephemeral files.) 404 | This directory is generally not flushed at boot-up, 405 | but time-based cleanup of files that have not been accessed for a certain time is applied. 406 | 407 | If applications find the environment variable `$TMPDIR` set, 408 | they should use the directory specified in it instead of `/var/tmp/` 409 | (see [`environ(7)`](https://man7.org/linux/man-pages/man7/environ.7.html) 410 | for details). 411 | 412 | The same security restrictions as with `/tmp/` apply: 413 | [`mkstemp(3)`](https://man7.org/linux/man-pages/man3/mkstemp.3.html), 414 | [`mkdtemp(3)`](https://man7.org/linux/man-pages/man3/mkdtemp.3.html), 415 | and similar calls should be used. 416 | For further details about this directory, see 417 | [Using /tmp/ and /var/tmp/ Safely](https://systemd.io/TEMPORARY_DIRECTORIES). 418 | 419 | ## Virtual Kernel and API File Systems 420 | 421 | ### `/dev/` 422 | 423 | The root directory for device nodes. 424 | Usually, this directory is mounted as a `devtmpfs` instance, 425 | but might be of a different type in sandboxed/containerized setups. 
426 | This directory is managed jointly by the kernel and 427 | a userspace component such as 428 | [`systemd-udevd(8)`](https://www.freedesktop.org/software/systemd/man/systemd-udevd.html), 429 | and should not be written to by other components. 430 | A number of special purpose virtual file systems might be mounted below this directory. 431 | 432 | ### `/dev/shm/` 433 | 434 | Place for POSIX shared memory segments, as created via 435 | [`shm_open(3)`](https://man7.org/linux/man-pages/man3/shm_open.3.html). 436 | This directory is flushed on boot, 437 | and is a `tmpfs` file system. 438 | Since all users have write access to this directory, 439 | special care should be taken to avoid name clashes and vulnerabilities. 440 | For normal users, shared memory segments in this directory are usually deleted 441 | when the user logs out. 442 | Usually, it is a better idea to use memory mapped files in `/run/` (for system programs) 443 | or `$XDG_RUNTIME_DIR` (for user programs) 444 | instead of POSIX shared memory segments, 445 | since these directories are not world-writable 446 | and hence not vulnerable to security-sensitive name clashes. 447 | 448 | ### `/proc/` 449 | 450 | A virtual kernel file system exposing the process list and other functionality. 451 | This file system is mostly an API to interface with the kernel 452 | and not a place where normal files may be stored. 453 | For details, see 454 | [`proc(5)`](https://man7.org/linux/man-pages/man5/proc.5.html). 455 | A number of special purpose virtual file systems might be mounted below this directory. 456 | 457 | ### `/proc/sys/` 458 | 459 | A hierarchy below `/proc/` that exposes a number of kernel tunables. 460 | The primary way to configure the settings in this API file tree is via 461 | [`sysctl.d(5)`](https://www.freedesktop.org/software/systemd/man/sysctl.d.html) 462 | files. 463 | In sandboxed/containerized setups, this directory is generally mounted read-only. 
464 | 465 | ### `/sys/` 466 | 467 | A virtual kernel file system exposing discovered devices and other functionality. 468 | This file system is mostly an API to interface with the kernel 469 | and not a place where normal files may be stored. 470 | In sandboxed/containerized setups, this directory is generally mounted read-only. 471 | A number of special purpose virtual file systems might be mounted below this directory. 472 | 473 | ### `/sys/fs/cgroup/` 474 | 475 | A virtual kernel file system exposing process control groups (cgroups). 476 | This file system is an API to interface with the kernel 477 | and not a place where normal files may be stored. 478 | On current systems running in the default "unified" mode, 479 | this directory serves as the mount point for the `cgroup2` filesystem, 480 | which provides a unified cgroup hierarchy for all resource controllers. 481 | On systems with non-default configurations, 482 | this directory may instead be a tmpfs filesystem 483 | containing mount points for various `cgroup` (v1) resource controllers; 484 | in such configurations, 485 | if `cgroup2` is mounted it will be mounted on `/sys/fs/cgroup/unified/`, 486 | but cgroup2 will not have resource controllers attached. 487 | In sandboxed/containerized setups, 488 | this directory may either not exist or may include a subset of functionality. 489 | 490 | ## Compatibility Symlinks 491 | 492 | ### `/bin/`; `/sbin/`; `/usr/sbin/` 493 | 494 | These compatibility symlinks point to `/usr/bin/`, 495 | ensuring that scripts and binaries referencing these legacy paths 496 | correctly find their binaries. 497 | 498 | ### `/lib/` 499 | 500 | This compatibility symlink points to `/usr/lib/`, 501 | ensuring that programs referencing this legacy path 502 | correctly find their resources. 
503 | 504 | ### `/lib64/` 505 | 506 | On some architecture ABIs, this compatibility symlink points to `$libdir`, 507 | ensuring that binaries referencing this legacy path 508 | correctly find their dynamic loader. 509 | This symlink only exists on architectures whose ABI 510 | places the dynamic loader in this path. 511 | 512 | ### `/var/run/` 513 | 514 | This compatibility symlink points to `/run/`, 515 | ensuring that programs referencing this legacy path 516 | correctly find their runtime data. 517 | 518 | ## Home Directory 519 | 520 | User applications may want to place files and directories 521 | in the user's home directory. 522 | They should follow the following basic structure. 523 | Note that some of these directories are also standardized 524 | (though more weakly) by the 525 | [XDG Base Directory Specification](https://specifications.freedesktop.org/basedir/latest/). 526 | Additional locations for high-level user resources are defined by 527 | [xdg-user-dirs](https://www.freedesktop.org/wiki/Software/xdg-user-dirs). 528 | 529 | ### `~/.cache/` 530 | 531 | Persistent user cache data. 532 | User programs may place non-essential data in this directory. 533 | Flushing this directory should have no effect on operation of programs, 534 | except for increased runtimes necessary to rebuild these caches. 535 | If an application finds `$XDG_CACHE_HOME` set, 536 | it should use the directory specified in it instead of this directory. 537 | 538 | ### `~/.config/` 539 | 540 | Application configuration. 541 | When a new user is created, this directory will be empty or not exist at all. 542 | Applications should fall back to defaults 543 | should their configuration in this directory be missing. 544 | If an application finds `$XDG_CONFIG_HOME` set, 545 | it should use the directory specified in it instead of this directory. 546 | 547 | ### `~/.local/bin/` 548 | 549 | Executables that shall appear in the user's `$PATH` search path. 
550 | It is recommended not to place executables in this directory 551 | that are not useful for invocation from a shell; 552 | these should be placed in a subdirectory of `~/.local/lib/` instead. 553 | Care should be taken when placing architecture-dependent binaries in this place, 554 | which might be problematic if the home directory is shared 555 | between multiple hosts with different architectures. 556 | 557 | ### `~/.local/lib/` 558 | 559 | Static, private vendor data that is compatible with all architectures. 560 | 561 | ### `~/.local/lib/<arch-id>/` 562 | 563 | Location for placing public dynamic libraries. 564 | The architecture identifier to use is defined on the 565 | [Multiarch Architecture Specifiers (Tuples)](https://wiki.debian.org/Multiarch/Tuples) 566 | list. 567 | 568 | ### `~/.local/share/` 569 | 570 | Resources shared between multiple packages, such as fonts or artwork. 571 | Usually, the precise location and format of files stored below this directory 572 | is subject to specifications that ensure interoperability. 573 | If an application finds `$XDG_DATA_HOME` set, 574 | it should use the directory specified in it instead of this directory. 575 | 576 | ### `~/.local/state/` 577 | 578 | Application state. 579 | When a new user is created, this directory will be empty or not exist at all. 580 | Applications should fall back to defaults 581 | should their state in this directory be missing. 582 | If an application finds `$XDG_STATE_HOME` set, 583 | it should use the directory specified in it instead of this directory. 584 | 585 | ## Write Access 586 | 587 | ### Unprivileged Write Access 588 | 589 | Unprivileged processes generally lack write access to most of the hierarchy. 590 | 591 | The exceptions for normal users are `/tmp/`, `/var/tmp/`, `/dev/shm/`, 592 | as well as the home directory `$HOME` (usually found below `/home/`) 593 | and the runtime directory `$XDG_RUNTIME_DIR` (found below `/run/user/`) 594 | of the user, which are all writable.
595 | 596 | For unprivileged system processes, 597 | only `/tmp/`, `/var/tmp/`, and `/dev/shm/` are writable. 598 | If an unprivileged system process needs a private writable directory 599 | in `/var/` or `/run/`, 600 | it is recommended to create it via 601 | the `StateDirectory=` and `RuntimeDirectory=` directives of service units 602 | (see 603 | [`systemd.unit(5)`](https://www.freedesktop.org/software/systemd/man/systemd.unit.html) 604 | for details), 605 | or via 606 | [`tmpfiles.d(5)`](https://www.freedesktop.org/software/systemd/man/tmpfiles.d.html) 607 | fragments during boot. 608 | A daemon may also create it before dropping privileges, 609 | but it is not recommended to start the daemon with privileges just for this. 610 | 611 | `/tmp/`, `/var/tmp/`, and `/dev/shm/` should be mounted `nosuid` and `nodev`, 612 | which means that set-user-id mode and character or block special devices 613 | are not interpreted on those file systems. 614 | In general it is not possible to mount them `noexec`, 615 | because various programs use those directories 616 | for dynamically generated or optimized code, 617 | and with that flag those use cases would break. 618 | Using this flag is OK on special-purpose installations or systems where 619 | all software that may be installed is known and does not require such functionality. 620 | See the discussion of `nosuid`/`nodev`/`noexec` in 621 | [`mount(8)`](https://man7.org/linux/man-pages/man8/mount.8.html) 622 | and `PROT_EXEC` in 623 | [`mmap(2)`](https://man7.org/linux/man-pages/man2/mmap.2.html). 624 | 625 | ### Lack of Write Access on Read-Only Systems and during System Recovery 626 | 627 | As noted above, some systems operate with 628 | the `/usr` and `/etc` hierarchies mounted read-only, 629 | possibly only allowing write access during package upgrades.
630 | Other parts of the hierarchy are generally mounted read-write 631 | (in particular `/var` and `/var/tmp`), 632 | but may be read-only when the kernel remounts the file system read-only 633 | in response to errors, 634 | or when the system is booted read-only for recovery purposes. 635 | To the extent reasonable, 636 | applications should be prepared to execute without write access, 637 | so that for example, failure to save non-essential data to `/var/cache/` 638 | or failure to create a custom log file under `/var/log` 639 | does not prevent the application from running. 640 | 641 | The `/run/` directory is available since the earliest boot 642 | and is always writable. 643 | It should be used for any runtime data and sockets, 644 | so that write access to e.g. `/etc` or `/var` is not needed. 645 | 646 | ## Node Types 647 | 648 | Unix file systems support different types of file nodes, 649 | including regular files, directories, symlinks, character and block device nodes, 650 | sockets and FIFOs. 651 | 652 | It is strongly recommended that `/dev/` is 653 | the only location below which device nodes shall be placed. 654 | Similarly, `/run/` shall be the only location to place sockets and FIFOs. 655 | Regular files, directories and symlinks may be used in all directories. 656 | 657 | Applications should expect that a security policy 658 | might be enforced on a system that enforces these rules. 659 | 660 | ## System Packages 661 | 662 | Developers of system packages should follow strict rules 663 | when placing their files in the file system. 664 | The following table lists recommended locations for 665 | specific types of files supplied by the vendor.
666 | 667 | ### System package vendor files locations 668 | 669 | | Directory | Purpose | 670 | |-------------------------------|---------| 671 | | `/usr/bin/` | Package executables that shall appear in the `$PATH` executable search path, compiled for any of the supported architectures compatible with the operating system. It is not recommended to place internal binaries or binaries that are not commonly invoked from the shell in this directory, such as daemon binaries. As this directory is shared with most other packages of the system, special care should be taken to pick unique names for files placed here, that are unlikely to clash with other packages' files. | 672 | | `/usr/lib/<arch-id>/` | Public shared libraries of the package. As above, be careful with using too generic names, and pick unique names for your libraries to place here to avoid name clashes. | 673 | | `/usr/lib/package/` | Private static vendor resources of the package, including private binaries and libraries, or any other kind of read-only vendor data. | 674 | | `/usr/lib/<arch-id>/package/` | Private other vendor resources of the package that are architecture-specific and cannot be shared between architectures. Note that this generally does not include private executables since binaries of a specific architecture may be freely invoked from any other supported system architecture. | 675 | 676 | Additional static vendor files with shared ownership 677 | may be installed in the `/usr/share/` hierarchy 678 | to the locations defined by the various relevant specifications. 679 | 680 | ### System package variable files locations 681 | 682 | The following directories shall be used by the package for 683 | local configuration and files created during runtime: 684 | 685 | | Directory | Purpose | 686 | |---------------------------------|---------| 687 | | `/etc/package/` | System-specific configuration for the package. It is recommended to default to safe fallbacks if this configuration is missing, if this is possible.
Alternatively, a [`tmpfiles.d(5)`](https://www.freedesktop.org/software/systemd/man/tmpfiles.d.html) fragment may be used to copy or symlink the necessary files and directories from `/usr/share/factory/` during boot, via the `L` or `C` directives. | 688 | | `/run/package/` | Runtime data for the package. Packages must be able to create the necessary subdirectories in this tree on their own, since the directory is flushed automatically on boot. Alternatively, a [`tmpfiles.d(5)`](https://www.freedesktop.org/software/systemd/man/tmpfiles.d.html) fragment may be used to create the necessary directories during boot, or the `RuntimeDirectory=` directive of service units may be used to create them at service startup (see [`systemd.unit(5)`](https://www.freedesktop.org/software/systemd/man/systemd.unit.html) for details). | 689 | | `/run/log/package/` | Runtime log data for the package. As above, the package needs to make sure to create this directory if necessary, as it will be flushed on every boot. | 690 | | `/var/cache/package/` | Persistent cache data of the package. If this directory is flushed, the application should work correctly on next invocation, though possibly slowed down due to the need to rebuild any local cache files. The application must be capable of recreating this directory should it be missing and necessary. To create an empty directory, a [`tmpfiles.d(5)`](https://www.freedesktop.org/software/systemd/man/tmpfiles.d.html) fragment or the `CacheDirectory=` directive of service units (see [`systemd.unit(5)`](https://www.freedesktop.org/software/systemd/man/systemd.unit.html)) may be used. | 691 | | `/var/lib/package/` | Persistent private data of the package. This is the primary place to put persistent data that does not fall into the other categories listed. Packages should be able to create the necessary subdirectories in this tree on their own, since the directory might be missing on boot.
To create an empty directory, a [`tmpfiles.d(5)`](https://www.freedesktop.org/software/systemd/man/tmpfiles.d.html) fragment or the `StateDirectory=` directive of service units (see [`systemd.unit(5)`](https://www.freedesktop.org/software/systemd/man/systemd.unit.html)) may be used. | 692 | | `/var/log/package/` | Persistent log data of the package. As above, the package should make sure to create this directory if necessary, possibly using [`tmpfiles.d(5)`](https://www.freedesktop.org/software/systemd/man/tmpfiles.d.html) or `LogsDirectory=` (see [`systemd.exec(5)`](https://www.freedesktop.org/software/systemd/man/systemd.exec.html)), as it might be missing. | 693 | 694 | ## User Packages 695 | 696 | Programs running in user context should follow strict rules 697 | when placing their own files in the user's home directory. 698 | The following table lists recommended locations 699 | in the home directory for specific types of files supplied by the vendor 700 | if the application is installed in the home directory. 701 | (User applications installed system-wide are 702 | covered by the rules outlined above for vendor files.) 703 | 704 | ### Vendor package file locations under the home directory of the user 705 | 706 | | Directory | Purpose | 707 | |-----------------------------------|---------| 708 | | `~/.local/bin/` | Package executables that shall appear in the `$PATH` executable search path. It is not recommended to place internal executables or executables that are not commonly invoked from the shell in this directory, such as daemon executables. As this directory is shared with most other packages of the user, special care should be taken to pick unique names for files placed here, that are unlikely to clash with other packages' files. | 709 | | `~/.local/lib/<arch-id>/` | Public shared libraries of the package. As above, be careful with using overly generic names, and pick unique names for your libraries to place here to avoid name clashes.
| 710 | | `~/.local/lib/package/` | Private, static vendor resources of the package, compatible with any architecture, or any other kind of read-only vendor data. | 711 | | `~/.local/lib/<arch-id>/package/` | Private other vendor resources of the package that are architecture-specific and cannot be shared between architectures. | 712 | 713 | Additional static vendor files with shared ownership 714 | may be installed in the `~/.local/share/` hierarchy, 715 | mirroring the subdirectories specified in the section 716 | "Vendor-supplied operating system resources" above. 717 | 718 | ### User package variable file locations 719 | 720 | The following directories shall be used by the package for 721 | per-user local configuration and files created during runtime: 722 | 723 | | Directory | Purpose | 724 | |-----------------------------|---------| 725 | | `~/.config/package/` | User-specific configuration for the package. It is required to default to safe fallbacks if this configuration is missing. | 726 | | `$XDG_RUNTIME_DIR/package/` | User runtime data for the package. | 727 | | `~/.cache/package/` | Persistent cache data of the package. If this directory is flushed, the application should work correctly on next invocation, though possibly slowed down due to the need to rebuild any local cache files. The application must be capable of recreating this directory should it be missing and necessary. | 728 | | `~/.local/state/package/` | Persistent state data of the package. | 729 | 730 | ## See Also 731 | 732 | The [`systemd(1)`](https://www.freedesktop.org/software/systemd/man/systemd.html) 733 | system and service manager implements and expects the layout described in this specification.
734 | -------------------------------------------------------------------------------- /specs/discoverable_partitions_specification.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: UAPI.2 Discoverable Partitions Specification 3 | category: Concepts 4 | layout: default 5 | version: 1.0 6 | SPDX-License-Identifier: CC-BY-4.0 7 | weight: 2 8 | aliases: 9 | - /UAPI.2 10 | - /2 11 | --- 12 | # UAPI.2 The Discoverable Partitions Specification (DPS) 13 | 14 | | Version | Changes | 15 | |---------|-----------------| 16 | | 1.0 | Initial Release | 17 | 18 | _TL;DR: Let's automatically discover, mount and enable the root partition, 19 | `/home/`, `/srv/`, `/var/` and `/var/tmp/` and the swap partitions based on 20 | GUID Partition Tables (GPT)!_ 21 | 22 | This specification describes the use of GUID Partition Table (GPT) UUIDs to 23 | enable automatic discovery of partitions and their intended mountpoints. 24 | Traditionally Linux has made little use of partition types, mostly just 25 | defining one UUID for file system/data partitions and another one for swap 26 | partitions. With this specification, we introduce additional partition types 27 | for specific uses. This has many benefits: 28 | 29 | * OS installers can automatically discover and make sense of partitions of 30 | existing Linux installations. 31 | * The OS can discover and mount the necessary file systems with a non-existent 32 | or incomplete `/etc/fstab` file and without the `root=` kernel command line 33 | option. 34 | * Container managers (such as nspawn and libvirt-lxc) can introspect and set up 35 | file systems contained in GPT disk images automatically and mount them to the 36 | right places, thus allowing booting the same, identical images on bare metal 37 | and in Linux containers. This enables true, natural portability of disk 38 | images between physical machines and Linux containers. 
39 | * As a help to administrators and users, partition manager tools can show more 40 | descriptive information about partition tables. 41 | 42 | Note that the OS side of this specification is currently implemented in 43 | [systemd](https://systemd.io/) 211 and newer in the 44 | [systemd-gpt-auto-generator(8)](https://www.freedesktop.org/software/systemd/man/systemd-gpt-auto-generator.html) 45 | generator tool. Note that automatic discovery of the root only works if the 46 | boot loader communicates this information to the OS, by implementing the 47 | [Boot Loader Interface](https://systemd.io/BOOT_LOADER_INTERFACE). 48 | 49 | ## Defined Partition Type UUIDs 50 | 51 | | Name | Partition Type UUID | Allowed File Systems | Explanation | 52 | |------|---------------------|----------------------|-------------| 53 | | _Root Partition (Alpha)_ | `6523f8ae-3eb1-4e2a-a05a-18b695ae656f` `SD_GPT_ROOT_ALPHA` | Any native, optionally in LUKS | On systems with matching architecture, the first partition with this type UUID on the disk containing the active EFI ESP is automatically mounted to the root directory `/`. If the partition is encrypted with LUKS or has dm-verity integrity data (see below), the device mapper file will be named `/dev/mapper/root`.
| 54 | | _Root Partition (ARC)_ | `d27f46ed-2919-4cb8-bd25-9531f3c16534` `SD_GPT_ROOT_ARC` | ditto | ditto | 55 | | _Root Partition (32-bit ARM)_ | `69dad710-2ce4-4e3c-b16c-21a1d49abed3` `SD_GPT_ROOT_ARM` | ditto | ditto | 56 | | _Root Partition (64-bit ARM/AArch64)_ | `b921b045-1df0-41c3-af44-4c6f280d3fae` `SD_GPT_ROOT_ARM64` | ditto | ditto | 57 | | _Root Partition (Itanium/IA-64)_ | `993d8d3d-f80e-4225-855a-9daf8ed7ea97` `SD_GPT_ROOT_IA64` | ditto | ditto | 58 | | _Root Partition (LoongArch 64-bit)_ | `77055800-792c-4f94-b39a-98c91b762bb6` `SD_GPT_ROOT_LOONGARCH64` | ditto | ditto | 59 | | _Root Partition (32-bit MIPS BigEndian (mips))_ | `e9434544-6e2c-47cc-bae2-12d6deafb44c` | ditto | ditto | 60 | | _Root Partition (64-bit MIPS BigEndian (mips64))_ | `d113af76-80ef-41b4-bdb6-0cff4d3d4a25` | ditto | ditto | 61 | | _Root Partition (32-bit MIPS LittleEndian (mipsel))_ | `37c58c8a-d913-4156-a25f-48b1b64e07f0` `SD_GPT_ROOT_MIPS_LE` | ditto | ditto | 62 | | _Root Partition (64-bit MIPS LittleEndian (mips64el))_ | `700bda43-7a34-4507-b179-eeb93d7a7ca3` `SD_GPT_ROOT_MIPS64_LE` | ditto | ditto | 63 | | _Root Partition (HPPA/PARISC)_ | `1aacdb3b-5444-4138-bd9e-e5c2239b2346` `SD_GPT_ROOT_PARISC` | ditto | ditto | 64 | | _Root Partition (32-bit PowerPC)_ | `1de3f1ef-fa98-47b5-8dcd-4a860a654d78` `SD_GPT_ROOT_PPC` | ditto | ditto | 65 | | _Root Partition (64-bit PowerPC BigEndian)_ | `912ade1d-a839-4913-8964-a10eee08fbd2` `SD_GPT_ROOT_PPC64` | ditto | ditto | 66 | | _Root Partition (64-bit PowerPC LittleEndian)_ | `c31c45e6-3f39-412e-80fb-4809c4980599` `SD_GPT_ROOT_PPC64_LE` | ditto | ditto | 67 | | _Root Partition (RISC-V 32-bit)_ | `60d5a7fe-8e7d-435c-b714-3dd8162144e1` `SD_GPT_ROOT_RISCV32` | ditto | ditto | 68 | | _Root Partition (RISC-V 64-bit)_ | `72ec70a6-cf74-40e6-bd49-4bda08e8f224` `SD_GPT_ROOT_RISCV64` | ditto | ditto | 69 | | _Root Partition (s390)_ | `08a7acea-624c-4a20-91e8-6e0fa67d23f9` `SD_GPT_ROOT_S390` | ditto | ditto | 70 | | _Root Partition (s390x)_ | 
`5eead9a9-fe09-4a1e-a1d7-520d00531306` `SD_GPT_ROOT_S390X` | ditto | ditto | 71 | | _Root Partition (TILE-Gx)_ | `c50cdd70-3862-4cc3-90e1-809a8c93ee2c` `SD_GPT_ROOT_TILEGX` | ditto | ditto | 72 | | _Root Partition (x86)_ | `44479540-f297-41b2-9af7-d131d5f0458a` `SD_GPT_ROOT_X86` | ditto | ditto | 73 | | _Root Partition (amd64/x86_64)_ | `4f68bce3-e8cd-4db1-96e7-fbcaf984b709` `SD_GPT_ROOT_X86_64` | ditto | ditto | 74 | | _`/usr/` Partition (Alpha)_ | `e18cf08c-33ec-4c0d-8246-c6c6fb3da024` `SD_GPT_USR_ALPHA` | Any native, optionally in LUKS | Similar semantics to root partition, but just the `/usr/` partition. | 75 | | _`/usr/` Partition (ARC)_ | `7978a683-6316-4922-bbee-38bff5a2fecc` `SD_GPT_USR_ARC` | ditto | ditto | 76 | | _`/usr/` Partition (32-bit ARM)_ | `7d0359a3-02b3-4f0a-865c-654403e70625` `SD_GPT_USR_ARM` | ditto | ditto | 77 | | _`/usr/` Partition (64-bit ARM/AArch64)_ | `b0e01050-ee5f-4390-949a-9101b17104e9` `SD_GPT_USR_ARM64` | ditto | ditto | 78 | | _`/usr/` Partition (Itanium/IA-64)_ | `4301d2a6-4e3b-4b2a-bb94-9e0b2c4225ea` `SD_GPT_USR_IA64` | ditto | ditto | 79 | | _`/usr/` Partition (LoongArch 64-bit)_ | `e611c702-575c-4cbe-9a46-434fa0bf7e3f` `SD_GPT_USR_LOONGARCH64` | ditto | ditto | 80 | | _`/usr/` Partition (32-bit MIPS BigEndian (mips))_ | `773b2abc-2a99-4398-8bf5-03baac40d02b` | ditto | ditto | 81 | | _`/usr/` Partition (64-bit MIPS BigEndian (mips64))_ | `57e13958-7331-4365-8e6e-35eeee17c61b` | ditto | ditto | 82 | | _`/usr/` Partition (32-bit MIPS LittleEndian (mipsel))_ | `0f4868e9-9952-4706-979f-3ed3a473e947` `SD_GPT_USR_MIPS_LE` | ditto | ditto | 83 | | _`/usr/` Partition (64-bit MIPS LittleEndian (mips64el))_ | `c97c1f32-ba06-40b4-9f22-236061b08aa8` `SD_GPT_USR_MIPS64_LE` | ditto | ditto | 84 | | _`/usr/` Partition (HPPA/PARISC)_ | `dc4a4480-6917-4262-a4ec-db9384949f25` `SD_GPT_USR_PARISC` | ditto | ditto | 85 | | _`/usr/` Partition (32-bit PowerPC)_ | `7d14fec5-cc71-415d-9d6c-06bf0b3c3eaf` `SD_GPT_USR_PPC` | ditto | ditto | 86 | | 
_`/usr/` Partition (64-bit PowerPC BigEndian)_ | `2c9739e2-f068-46b3-9fd0-01c5a9afbcca` `SD_GPT_USR_PPC64` | ditto | ditto | 87 | | _`/usr/` Partition (64-bit PowerPC LittleEndian)_ | `15bb03af-77e7-4d4a-b12b-c0d084f7491c` `SD_GPT_USR_PPC64_LE` | ditto | ditto | 88 | | _`/usr/` Partition (RISC-V 32-bit)_ | `b933fb22-5c3f-4f91-af90-e2bb0fa50702` `SD_GPT_USR_RISCV32` | ditto | ditto | 89 | | _`/usr/` Partition (RISC-V 64-bit)_ | `beaec34b-8442-439b-a40b-984381ed097d` `SD_GPT_USR_RISCV64` | ditto | ditto | 90 | | _`/usr/` Partition (s390)_ | `cd0f869b-d0fb-4ca0-b141-9ea87cc78d66` `SD_GPT_USR_S390` | ditto | ditto | 91 | | _`/usr/` Partition (s390x)_ | `8a4f5770-50aa-4ed3-874a-99b710db6fea` `SD_GPT_USR_S390X` | ditto | ditto | 92 | | _`/usr/` Partition (TILE-Gx)_ | `55497029-c7c1-44cc-aa39-815ed1558630` `SD_GPT_USR_TILEGX` | ditto | ditto | 93 | | _`/usr/` Partition (x86)_ | `75250d76-8cc6-458e-bd66-bd47cc81a812` `SD_GPT_USR_X86` | ditto | ditto | 94 | | _`/usr/` Partition (amd64/x86_64)_ | `8484680c-9521-48c6-9c11-b0720656f69e` `SD_GPT_USR_X86_64` | ditto | ditto | 95 | | _Root Verity Partition (Alpha)_ | `fc56d9e9-e6e5-4c06-be32-e74407ce09a5` `SD_GPT_ROOT_ALPHA_VERITY` | A dm-verity superblock followed by hash data | Contains dm-verity integrity hash data for the matching root partition. If this feature is used the partition UUID of the root partition should be the first 128 bits of the root hash of the dm-verity hash data, and the partition UUID of this dm-verity partition should be the final 128 bits of it, so that the root partition and its Verity partition can be discovered easily, simply by specifying the root hash. 
| 96 | | _Root Verity Partition (ARC)_ | `24b2d975-0f97-4521-afa1-cd531e421b8d` `SD_GPT_ROOT_ARC_VERITY` | ditto | ditto | 97 | | _Root Verity Partition (32-bit ARM)_ | `7386cdf2-203c-47a9-a498-f2ecce45a2d6` `SD_GPT_ROOT_ARM_VERITY` | ditto | ditto | 98 | | _Root Verity Partition (64-bit ARM/AArch64)_ | `df3300ce-d69f-4c92-978c-9bfb0f38d820` `SD_GPT_ROOT_ARM64_VERITY` | ditto | ditto | 99 | | _Root Verity Partition (Itanium/IA-64)_ | `86ed10d5-b607-45bb-8957-d350f23d0571` `SD_GPT_ROOT_IA64_VERITY` | ditto | ditto | 100 | | _Root Verity Partition (LoongArch 64-bit)_ | `f3393b22-e9af-4613-a948-9d3bfbd0c535` `SD_GPT_ROOT_LOONGARCH64_VERITY` | ditto | ditto | 101 | | _Root Verity Partition (32-bit MIPS BigEndian (mips))_ | `7a430799-f711-4c7e-8e5b-1d685bd48607` | ditto | ditto | 102 | | _Root Verity Partition (64-bit MIPS BigEndian (mips64))_ | `579536f8-6a33-4055-a95a-df2d5e2c42a8` | ditto | ditto | 103 | | _Root Verity Partition (32-bit MIPS LittleEndian (mipsel))_ | `d7d150d2-2a04-4a33-8f12-16651205ff7b` `SD_GPT_ROOT_MIPS_LE_VERITY` | ditto | ditto | 104 | | _Root Verity Partition (64-bit MIPS LittleEndian (mips64el))_ | `16b417f8-3e06-4f57-8dd2-9b5232f41aa6` `SD_GPT_ROOT_MIPS64_LE_VERITY` | ditto | ditto | 105 | | _Root Verity Partition (HPPA/PARISC)_ | `d212a430-fbc5-49f9-a983-a7feef2b8d0e` `SD_GPT_ROOT_PARISC_VERITY` | ditto | ditto | 106 | | _Root Verity Partition (64-bit PowerPC LittleEndian)_ | `906bd944-4589-4aae-a4e4-dd983917446a` `SD_GPT_ROOT_PPC64_LE_VERITY` | ditto | ditto | 107 | | _Root Verity Partition (64-bit PowerPC BigEndian)_ | `9225a9a3-3c19-4d89-b4f6-eeff88f17631` `SD_GPT_ROOT_PPC64_VERITY` | ditto | ditto | 108 | | _Root Verity Partition (32-bit PowerPC)_ | `98cfe649-1588-46dc-b2f0-add147424925` `SD_GPT_ROOT_PPC_VERITY` | ditto | ditto | 109 | | _Root Verity Partition (RISC-V 32-bit)_ | `ae0253be-1167-4007-ac68-43926c14c5de` `SD_GPT_ROOT_RISCV32_VERITY` | ditto | ditto | 110 | | _Root Verity Partition (RISC-V 64-bit)_ | 
`b6ed5582-440b-4209-b8da-5ff7c419ea3d` `SD_GPT_ROOT_RISCV64_VERITY` | ditto | ditto | 111 | | _Root Verity Partition (s390)_ | `7ac63b47-b25c-463b-8df8-b4a94e6c90e1` `SD_GPT_ROOT_S390_VERITY` | ditto | ditto | 112 | | _Root Verity Partition (s390x)_ | `b325bfbe-c7be-4ab8-8357-139e652d2f6b` `SD_GPT_ROOT_S390X_VERITY` | ditto | ditto | 113 | | _Root Verity Partition (TILE-Gx)_ | `966061ec-28e4-4b2e-b4a5-1f0a825a1d84` `SD_GPT_ROOT_TILEGX_VERITY` | ditto | ditto | 114 | | _Root Verity Partition (amd64/x86_64)_ | `2c7357ed-ebd2-46d9-aec1-23d437ec2bf5` `SD_GPT_ROOT_X86_64_VERITY` | ditto | ditto | 115 | | _Root Verity Partition (x86)_ | `d13c5d3b-b5d1-422a-b29f-9454fdc89d76` `SD_GPT_ROOT_X86_VERITY` | ditto | ditto | 116 | | _`/usr/` Verity Partition (Alpha)_ | `8cce0d25-c0d0-4a44-bd87-46331bf1df67` `SD_GPT_USR_ALPHA_VERITY` | A dm-verity superblock followed by hash data | Similar semantics to root Verity partition, but just for the `/usr/` partition. | 117 | | _`/usr/` Verity Partition (ARC)_ | `fca0598c-d880-4591-8c16-4eda05c7347c` `SD_GPT_USR_ARC_VERITY` | ditto | ditto | 118 | | _`/usr/` Verity Partition (32-bit ARM)_ | `c215d751-7bcd-4649-be90-6627490a4c05` `SD_GPT_USR_ARM_VERITY` | ditto | ditto | 119 | | _`/usr/` Verity Partition (64-bit ARM/AArch64)_ | `6e11a4e7-fbca-4ded-b9e9-e1a512bb664e` `SD_GPT_USR_ARM64_VERITY` | ditto | ditto | 120 | | _`/usr/` Verity Partition (Itanium/IA-64)_ | `6a491e03-3be7-4545-8e38-83320e0ea880` `SD_GPT_USR_IA64_VERITY` | ditto | ditto | 121 | | _`/usr/` Verity Partition (LoongArch 64-bit)_ | `f46b2c26-59ae-48f0-9106-c50ed47f673d` `SD_GPT_USR_LOONGARCH64_VERITY` | ditto | ditto | 122 | | _`/usr/` Verity Partition (32-bit MIPS BigEndian (mips))_ | `6e5a1bc8-d223-49b7-bca8-37a5fcceb996` | ditto | ditto | 123 | | _`/usr/` Verity Partition (64-bit MIPS BigEndian (mips64))_ | `81cf9d90-7458-4df4-8dcf-c8a3a404f09b` | ditto | ditto | 124 | | _`/usr/` Verity Partition (32-bit MIPS LittleEndian (mipsel))_ | 
`46b98d8d-b55c-4e8f-aab3-37fca7f80752` `SD_GPT_USR_MIPS_LE_VERITY` | ditto | ditto | 125 | | _`/usr/` Verity Partition (64-bit MIPS LittleEndian (mips64el))_ | `3c3d61fe-b5f3-414d-bb71-8739a694a4ef` `SD_GPT_USR_MIPS64_LE_VERITY` | ditto | ditto | 126 | | _`/usr/` Verity Partition (HPPA/PARISC)_ | `5843d618-ec37-48d7-9f12-cea8e08768b2` `SD_GPT_USR_PARISC_VERITY` | ditto | ditto | 127 | | _`/usr/` Verity Partition (64-bit PowerPC LittleEndian)_ | `ee2b9983-21e8-4153-86d9-b6901a54d1ce` `SD_GPT_USR_PPC64_LE_VERITY` | ditto | ditto | 128 | | _`/usr/` Verity Partition (64-bit PowerPC BigEndian)_ | `bdb528a5-a259-475f-a87d-da53fa736a07` `SD_GPT_USR_PPC64_VERITY` | ditto | ditto | 129 | | _`/usr/` Verity Partition (32-bit PowerPC)_ | `df765d00-270e-49e5-bc75-f47bb2118b09` `SD_GPT_USR_PPC_VERITY` | ditto | ditto | 130 | | _`/usr/` Verity Partition (RISC-V 32-bit)_ | `cb1ee4e3-8cd0-4136-a0a4-aa61a32e8730` `SD_GPT_USR_RISCV32_VERITY` | ditto | ditto | 131 | | _`/usr/` Verity Partition (RISC-V 64-bit)_ | `8f1056be-9b05-47c4-81d6-be53128e5b54` `SD_GPT_USR_RISCV64_VERITY` | ditto | ditto | 132 | | _`/usr/` Verity Partition (s390)_ | `b663c618-e7bc-4d6d-90aa-11b756bb1797` `SD_GPT_USR_S390_VERITY` | ditto | ditto | 133 | | _`/usr/` Verity Partition (s390x)_ | `31741cc4-1a2a-4111-a581-e00b447d2d06` `SD_GPT_USR_S390X_VERITY` | ditto | ditto | 134 | | _`/usr/` Verity Partition (TILE-Gx)_ | `2fb4bf56-07fa-42da-8132-6b139f2026ae` `SD_GPT_USR_TILEGX_VERITY` | ditto | ditto | 135 | | _`/usr/` Verity Partition (amd64/x86_64)_ | `77ff5f63-e7b6-4633-acf4-1565b864c0e6` `SD_GPT_USR_X86_64_VERITY` | ditto | ditto | 136 | | _`/usr/` Verity Partition (x86)_ | `8f461b0d-14ee-4e81-9aa9-049b6fb97abd` `SD_GPT_USR_X86_VERITY` | ditto | ditto | 137 | | _Root Verity Signature Partition (Alpha)_ | `d46495b7-a053-414f-80f7-700c99921ef8` `SD_GPT_ROOT_ALPHA_VERITY_SIG` | A serialized JSON object, see below | Contains a root hash and a PKCS#7 signature for it, permitting signed dm-verity GPT images. 
| 138 | | _Root Verity Signature Partition (ARC)_ | `143a70ba-cbd3-4f06-919f-6c05683a78bc` `SD_GPT_ROOT_ARC_VERITY_SIG` | ditto | ditto | 139 | | _Root Verity Signature Partition (32-bit ARM)_ | `42b0455f-eb11-491d-98d3-56145ba9d037` `SD_GPT_ROOT_ARM_VERITY_SIG` | ditto | ditto | 140 | | _Root Verity Signature Partition (64-bit ARM/AArch64)_ | `6db69de6-29f4-4758-a7a5-962190f00ce3` `SD_GPT_ROOT_ARM64_VERITY_SIG` | ditto | ditto | 141 | | _Root Verity Signature Partition (Itanium/IA-64)_ | `e98b36ee-32ba-4882-9b12-0ce14655f46a` `SD_GPT_ROOT_IA64_VERITY_SIG` | ditto | ditto | 142 | | _Root Verity Signature Partition (LoongArch 64-bit)_ | `5afb67eb-ecc8-4f85-ae8e-ac1e7c50e7d0` `SD_GPT_ROOT_LOONGARCH64_VERITY_SIG` | ditto | ditto | 143 | | _Root Verity Signature Partition (32-bit MIPS BigEndian (mips))_ | `bba210a2-9c5d-45ee-9e87-ff2ccbd002d0` | ditto | ditto | 144 | | _Root Verity Signature Partition (64-bit MIPS BigEndian (mips64))_ | `43ce94d4-0f3d-4999-8250-b9deafd98e6e` | ditto | ditto | 145 | | _Root Verity Signature Partition (32-bit MIPS LittleEndian (mipsel))_ | `c919cc1f-4456-4eff-918c-f75e94525ca5` `SD_GPT_ROOT_MIPS_LE_VERITY_SIG` | ditto | ditto | 146 | | _Root Verity Signature Partition (64-bit MIPS LittleEndian (mips64el))_ | `904e58ef-5c65-4a31-9c57-6af5fc7c5de7` `SD_GPT_ROOT_MIPS64_LE_VERITY_SIG` | ditto | ditto | 147 | | _Root Verity Signature Partition (HPPA/PARISC)_ | `15de6170-65d3-431c-916e-b0dcd8393f25` `SD_GPT_ROOT_PARISC_VERITY_SIG` | ditto | ditto | 148 | | _Root Verity Signature Partition (64-bit PowerPC LittleEndian)_ | `d4a236e7-e873-4c07-bf1d-bf6cf7f1c3c6` `SD_GPT_ROOT_PPC64_LE_VERITY_SIG` | ditto | ditto | 149 | | _Root Verity Signature Partition (64-bit PowerPC BigEndian)_ | `f5e2c20c-45b2-4ffa-bce9-2a60737e1aaf` `SD_GPT_ROOT_PPC64_VERITY_SIG` | ditto | ditto | 150 | | _Root Verity Signature Partition (32-bit PowerPC)_ | `1b31b5aa-add9-463a-b2ed-bd467fc857e7` `SD_GPT_ROOT_PPC_VERITY_SIG` | ditto | ditto | 151 | | _Root Verity Signature 
Partition (RISC-V 32-bit)_ | `3a112a75-8729-4380-b4cf-764d79934448` `SD_GPT_ROOT_RISCV32_VERITY_SIG` | ditto | ditto | 152 | | _Root Verity Signature Partition (RISC-V 64-bit)_ | `efe0f087-ea8d-4469-821a-4c2a96a8386a` `SD_GPT_ROOT_RISCV64_VERITY_SIG` | ditto | ditto | 153 | | _Root Verity Signature Partition (s390)_ | `3482388e-4254-435a-a241-766a065f9960` `SD_GPT_ROOT_S390_VERITY_SIG` | ditto | ditto | 154 | | _Root Verity Signature Partition (s390x)_ | `c80187a5-73a3-491a-901a-017c3fa953e9` `SD_GPT_ROOT_S390X_VERITY_SIG` | ditto | ditto | 155 | | _Root Verity Signature Partition (TILE-Gx)_ | `b3671439-97b0-4a53-90f7-2d5a8f3ad47b` `SD_GPT_ROOT_TILEGX_VERITY_SIG` | ditto | ditto | 156 | | _Root Verity Signature Partition (amd64/x86_64)_ | `41092b05-9fc8-4523-994f-2def0408b176` `SD_GPT_ROOT_X86_64_VERITY_SIG` | ditto | ditto | 157 | | _Root Verity Signature Partition (x86)_ | `5996fc05-109c-48de-808b-23fa0830b676` `SD_GPT_ROOT_X86_VERITY_SIG` | ditto | ditto | 158 | | _`/usr/` Verity Signature Partition (Alpha)_ | `5c6e1c76-076a-457a-a0fe-f3b4cd21ce6e` `SD_GPT_USR_ALPHA_VERITY_SIG` | A serialized JSON object, see below | Similar semantics to root Verity signature partition, but just for the `/usr/` partition. 
| 159 | | _`/usr/` Verity Signature Partition (ARC)_ | `94f9a9a1-9971-427a-a400-50cb297f0f35` `SD_GPT_USR_ARC_VERITY_SIG` | ditto | ditto | 160 | | _`/usr/` Verity Signature Partition (32-bit ARM)_ | `d7ff812f-37d1-4902-a810-d76ba57b975a` `SD_GPT_USR_ARM_VERITY_SIG` | ditto | ditto | 161 | | _`/usr/` Verity Signature Partition (64-bit ARM/AArch64)_ | `c23ce4ff-44bd-4b00-b2d4-b41b3419e02a` `SD_GPT_USR_ARM64_VERITY_SIG` | ditto | ditto | 162 | | _`/usr/` Verity Signature Partition (Itanium/IA-64)_ | `8de58bc2-2a43-460d-b14e-a76e4a17b47f` `SD_GPT_USR_IA64_VERITY_SIG` | ditto | ditto | 163 | | _`/usr/` Verity Signature Partition (LoongArch 64-bit)_ | `b024f315-d330-444c-8461-44bbde524e99` `SD_GPT_USR_LOONGARCH64_VERITY_SIG` | ditto | ditto | 164 | | _`/usr/` Verity Signature Partition (32-bit MIPS BigEndian (mips))_ | `97ae158d-f216-497b-8057-f7f905770f54` | ditto | ditto | 165 | | _`/usr/` Verity Signature Partition (64-bit MIPS BigEndian (mips64))_ | `05816ce2-dd40-4ac6-a61d-37d32dc1ba7d` | ditto | ditto | 166 | | _`/usr/` Verity Signature Partition (32-bit MIPS LittleEndian (mipsel))_ | `3e23ca0b-a4bc-4b4e-8087-5ab6a26aa8a9` `SD_GPT_USR_MIPS_LE_VERITY_SIG` | ditto | ditto | 167 | | _`/usr/` Verity Signature Partition (64-bit MIPS LittleEndian (mips64el))_ | `f2c2c7ee-adcc-4351-b5c6-ee9816b66e16` `SD_GPT_USR_MIPS64_LE_VERITY_SIG` | ditto | ditto | 168 | | _`/usr/` Verity Signature Partition (HPPA/PARISC)_ | `450dd7d1-3224-45ec-9cf2-a43a346d71ee` `SD_GPT_USR_PARISC_VERITY_SIG` | ditto | ditto | 169 | | _`/usr/` Verity Signature Partition (64-bit PowerPC LittleEndian)_ | `c8bfbd1e-268e-4521-8bba-bf314c399557` `SD_GPT_USR_PPC64_LE_VERITY_SIG` | ditto | ditto | 170 | | _`/usr/` Verity Signature Partition (64-bit PowerPC BigEndian)_ | `0b888863-d7f8-4d9e-9766-239fce4d58af` `SD_GPT_USR_PPC64_VERITY_SIG` | ditto | ditto | 171 | | _`/usr/` Verity Signature Partition (32-bit PowerPC)_ | `7007891d-d371-4a80-86a4-5cb875b9302e` `SD_GPT_USR_PPC_VERITY_SIG` | ditto | ditto | 172 | 
| _`/usr/` Verity Signature Partition (RISC-V 32-bit)_ | `c3836a13-3137-45ba-b583-b16c50fe5eb4` `SD_GPT_USR_RISCV32_VERITY_SIG` | ditto | ditto | 173 | | _`/usr/` Verity Signature Partition (RISC-V 64-bit)_ | `d2f9000a-7a18-453f-b5cd-4d32f77a7b32` `SD_GPT_USR_RISCV64_VERITY_SIG` | ditto | ditto | 174 | | _`/usr/` Verity Signature Partition (s390)_ | `17440e4f-a8d0-467f-a46e-3912ae6ef2c5` `SD_GPT_USR_S390_VERITY_SIG` | ditto | ditto | 175 | | _`/usr/` Verity Signature Partition (s390x)_ | `3f324816-667b-46ae-86ee-9b0c0c6c11b4` `SD_GPT_USR_S390X_VERITY_SIG` | ditto | ditto | 176 | | _`/usr/` Verity Signature Partition (TILE-Gx)_ | `4ede75e2-6ccc-4cc8-b9c7-70334b087510` `SD_GPT_USR_TILEGX_VERITY_SIG` | ditto | ditto | 177 | | _`/usr/` Verity Signature Partition (amd64/x86_64)_ | `e7bb33fb-06cf-4e81-8273-e543b413e2e2` `SD_GPT_USR_X86_64_VERITY_SIG` | ditto | ditto | 178 | | _`/usr/` Verity Signature Partition (x86)_ | `974a71c0-de41-43c3-be5d-5c5ccd1ad2c0` `SD_GPT_USR_X86_VERITY_SIG` | ditto | ditto | 179 | | _EFI System Partition_ | `c12a7328-f81f-11d2-ba4b-00a0c93ec93b` `SD_GPT_ESP` | VFAT | The ESP used for the current boot is automatically mounted to `/boot/` or `/efi/`, unless a different partition is mounted there (possibly via `/etc/fstab`) or the mount point directory is non-empty on the root disk. If both ESP and XBOOTLDR exist, the `/efi/` mount point shall be used for ESP. This partition type is defined by the [UEFI Specification](http://www.uefi.org/specifications). | 180 | | _Extended Boot Loader Partition_ | `bc13c2ff-59e6-4262-a352-b275fd6f7172` `SD_GPT_XBOOTLDR` | Typically VFAT | The Extended Boot Loader Partition (XBOOTLDR) used for the current boot is automatically mounted to `/boot/`, unless a different partition is mounted there (possibly via `/etc/fstab`) or the mount point directory is non-empty on the root disk. This partition type is defined by the [Boot Loader Specification](https://systemd.io/BOOT_LOADER_SPECIFICATION). 
| 181 | | _Swap_ | `0657fd6d-a4ab-43c4-84e5-0933c84b4f4f` `SD_GPT_SWAP` | Swap, optionally in LUKS | All swap partitions on the disk containing the root partition are automatically enabled. If the partition is encrypted with LUKS, the device mapper file will be named `/dev/mapper/swap`. This partition type predates the Discoverable Partitions Specification. | 182 | | _Home Partition_ | `933ac7e1-2eb4-4f13-b844-0e14e2aef915` `SD_GPT_HOME` | Any native, optionally in LUKS | The first partition with this type UUID on the disk containing the root partition is automatically mounted to `/home/`. If the partition is encrypted with LUKS, the device mapper file will be named `/dev/mapper/home`. | 183 | | _Server Data Partition_ | `3b8f8425-20e0-4f3b-907f-1a25a76f98e8` `SD_GPT_SRV` | Any native, optionally in LUKS | The first partition with this type UUID on the disk containing the root partition is automatically mounted to `/srv/`. If the partition is encrypted with LUKS, the device mapper file will be named `/dev/mapper/srv`. | 184 | | _Variable Data Partition_ | `4d21b016-b534-45c2-a9fb-5c16e091fd2d` `SD_GPT_VAR` | Any native, optionally in LUKS | The first partition with this type UUID on the disk containing the root partition is automatically mounted to `/var/` — under the condition that its partition UUID matches the first 128 bits of `HMAC-SHA256(machine-id, 0x4d21b016b53445c2a9fb5c16e091fd2d)` (i.e. the SHA256 HMAC hash of the binary type UUID keyed by the machine ID as read from [`/etc/machine-id`](https://www.freedesktop.org/software/systemd/man/machine-id.html)). This special requirement is made because `/var/` (unlike the other partition types listed here) is inherently private to a specific installation and cannot possibly be shared between multiple OS installations on the same disk, and thus should be bound to a specific instance of the OS, identified by its machine ID.
If the partition is encrypted with LUKS, the device mapper file will be named `/dev/mapper/var`. | 185 | | _Temporary Data Partition_ | `7ec6f557-3bc5-4aca-b293-16ef5df639d1` `SD_GPT_TMP` | Any native, optionally in LUKS | The first partition with this type UUID on the disk containing the root partition is automatically mounted to `/var/tmp/`. If the partition is encrypted with LUKS, the device mapper file will be named `/dev/mapper/tmp`. Note that the intended mount point is indeed `/var/tmp/`, not `/tmp/`. The latter is typically maintained in memory via `tmpfs` and does not require a partition on disk. In some cases it might be desirable to make `/tmp/` persistent too, in which case it is recommended to make it a symlink or bind mount to `/var/tmp/`, thus not requiring its own partition type UUID. | 186 | | _Per-user Home Partition_ | `773f91ef-66d4-49b5-bd83-d683bf40ad16` `SD_GPT_USER_HOME` | Any native, optionally in LUKS | A home partition of a user, managed by [`systemd-homed`](https://www.freedesktop.org/software/systemd/man/systemd-homed.html). | 187 | | _Generic Linux Data Partition_ | `0fc63daf-8483-4772-8e79-3d69d8477de4` `SD_GPT_LINUX_GENERIC` | Any native, optionally in LUKS | No automatic mounting takes place for other Linux data partitions. This partition type should be used for all partitions that carry Linux file systems. The installer needs to mount them explicitly via entries in `/etc/fstab`. Optionally, these partitions may be encrypted with LUKS. This partition type predates the Discoverable Partitions Specification. | 188 | 189 | Other GPT type IDs might be used on Linux, for example to mark software RAID or 190 | LVM partitions. The definitions of those GPT types are outside of the scope of 191 | this specification. 192 | 193 | [systemd-id128(1)](https://www.freedesktop.org/software/systemd/man/systemd-id128.html)'s 194 | `show` command may be used to list those GPT partition type UUIDs.
195 | 196 | ## Partition Names 197 | 198 | For partitions of the types listed above it is recommended to use 199 | human-friendly, descriptive partition names in the GPT partition table, for 200 | example "*Home*", "*Server* *Data*", "*Fedora* *Root*" and similar, possibly 201 | localized. 202 | 203 | For the Root/Verity/Verity signature partitions it might make sense to use a 204 | versioned naming scheme reflecting the OS name and its version, 205 | e.g. "fooOS_2021.4" or similar. 206 | For details about the version format see the 207 | [Version Format Specification](version_format_specification.md). The underscore 208 | character (`_`) must be used to separate the version from the name of the image. 209 | 210 | ## Partition Attribute Flags 211 | 212 | This specification defines three GPT partition attribute flags that may be set 213 | for the partition types defined above: 214 | 215 | 1. For the root, `/usr/`, Verity, Verity signature, home, server data, variable 216 | data, temporary data, swap, and extended boot loader partitions, the 217 | partition flag bit 63 ("*no-auto*", *SD_GPT_FLAG_NO_AUTO*) may be used to 218 | turn off auto-discovery for the specific partition. If set, the partition 219 | will not be automatically mounted or enabled. 220 | 221 | 2. For the root, `/usr/`, Verity, Verity signature, home, server data, variable 222 | data, temporary data and extended boot loader partitions, the partition flag 223 | bit 60 ("*read-only*", *SD_GPT_FLAG_READ_ONLY*) may be used to mark a 224 | partition for read-only mounts only. If set, the partition will be mounted 225 | read-only instead of read-write. Note that the variable data partition and 226 | the temporary data partition will generally not be able to serve their 227 | purpose if marked read-only, since by their very definition they are 228 | supposed to be mutable.
(The home and server data partitions are generally 229 | assumed to be mutable as well, but the requirement for them is not equally 230 | strong.) Because of that, while the read-only flag is defined and supported, 231 | it's almost never a good idea to actually use it for these partitions. Also 232 | note that Verity and signature partitions are by their semantics always 233 | read-only. The flag is hence of little effect for them, and it is 234 | recommended to set it unconditionally for the Verity and signature partition 235 | types. 236 | 237 | 3. For the root, `/usr/`, home, server data, variable data, temporary data and 238 | extended boot loader partitions, the partition flag bit 59 239 | ("*grow-file-system*", *SD_GPT_FLAG_GROWFS*) may be used to mark a partition 240 | for automatic growing of the contained file system to the size of the 241 | partition when mounted. Tools that automatically mount disk images with a GPT 242 | partition table are suggested to implicitly grow the contained file system 243 | to the partition size they are contained in, if they are found to be 244 | smaller. This flag is without effect on partitions marked "*read-only*". 245 | 246 | Note that the first two flag definitions happen to correspond nicely to the 247 | same ones used by Microsoft Basic Data Partitions. 248 | 249 | All three of these flags generally affect only auto-discovery and automatic 250 | mounting of disk images. If partitions marked with these flags are mounted 251 | using low-level commands like 252 | [mount(8)](https://man7.org/linux/man-pages/man8/mount.8.html) or directly with 253 | [mount(2)](https://man7.org/linux/man-pages/man2/mount.2.html), they typically 254 | have no effect. 255 | 256 | ## Verity 257 | 258 | The Root/`/usr/` partition types and their matching Verity and Verity signature 259 | partitions enable relatively automatic handling of `dm-verity` protected 260 | setups.
These types are defined with two modes of operation in mind: 261 | 262 | 1. A trusted Verity root hash is passed in externally, for example specified 263 | on the kernel command line that is signed along with the kernel image using 264 | SecureBoot PE signing (which in turn is tested against a 265 | firmware-provided set of signing keys). If so, discovery and setup of a 266 | Verity volume may be fully automatic: if the root partition's UUID is chosen 267 | to match the first 128 bits of the root hash, and the matching Verity 268 | partition's UUID is chosen to match the last 128 bits of the root hash, then 269 | automatic discovery and match-up of the two partitions is possible, as the 270 | root hash is enough to both find the partitions and then combine them in a 271 | Verity volume. In this mode a Verity signature partition is not used and 272 | unnecessary. 273 | 274 | 2. A Verity signature partition is included on the disk, with a signature to be 275 | tested against a system-provided set of signing keys. The signature 276 | partition primarily contains two fields: the root hash to use, and a PKCS#7 277 | signature of it, using a signature key trusted by the OS. If so, discovery 278 | and setup of a Verity volume may be fully automatic. First, the specified 279 | root hash is validated with the signature and the OS-provided trusted 280 | keys. If the signature checks out, the root hash is then used in the same way 281 | as in the first mode of operation described above. 282 | 283 | Both modes of operation may be combined in a single image.
This is particularly 284 | useful for images that shall be usable in two different contexts: for example 285 | an image that shall be able to boot directly on UEFI systems (in which 286 | case it makes sense to include the root hash on the kernel command line that is 287 | included in the signed kernel image to boot, as per mode of operation #1 288 | above), but also be able to be used as an image for a container engine (such as 289 | `systemd-nspawn`), which can use the signature partition to validate the image, 290 | without making use of the signed kernel image (and thus following mode of 291 | operation #2). 292 | 293 | The Verity signature partition's contents should be a serialized JSON object in 294 | text form, padded with NUL bytes to the next multiple of 4096 bytes in 295 | size. Currently three fields are defined for the JSON object: 296 | 297 | 1. The (mandatory) `rootHash` field should be a string containing the Verity root hash, 298 | formatted as a series of (lowercase) hex characters. 299 | 300 | 2. The (mandatory) `signature` field should be a string containing the PKCS#7 301 | signature of the root hash, in Base64-encoded DER format. This should be the 302 | same format used by the Linux kernel's dm-verity signature logic, i.e. the 303 | signed data should be the exact string representation of the hash, as stored 304 | in `rootHash` above. 305 | 306 | 3. The (optional) `certificateFingerprint` field should be a string containing 307 | a SHA256 fingerprint of the X.509 certificate in DER format for the key that 308 | signed the root hash, formatted as a series of (lowercase) hex characters (no `:` 309 | separators or such). 310 | 311 | More fields might be added in later revisions of this specification. 312 | 313 | ## Suggested Mode of Operation 314 | 315 | An *installer* that repartitions the hard disk _should_ use the above UUID 316 | partition types for appropriate partitions it creates.
317 | 318 | An *installer* which supports a "manual partitioning" interface _may_ choose to 319 | pre-populate the interface with swap, `/home/`, `/srv/`, `/var/tmp/` partitions 320 | of pre-existing Linux installations, identified with the GPT type UUIDs 321 | above. The installer should not pre-populate such an interface with any 322 | identified root, `/usr/` or `/var/` partition unless the intention is to 323 | overwrite an existing operating system that might be installed. 324 | 325 | An *installer* _may_ omit creating entries in `/etc/fstab` for root, `/home/`, 326 | `/srv/`, `/var/`, `/var/tmp/` and for the swap partitions if they use these UUID 327 | partition types, and are the first partitions on the disk of each type. If the 328 | ESP shall be mounted to `/efi/` (or `/boot/`), it may additionally omit 329 | creating the entry for it in `/etc/fstab`. If the EFI partition shall not be 330 | mounted to `/efi/` or `/boot/`, the installer _must_ create an `/etc/fstab` entry for it. 331 | If other partitions are used (for example for `/usr/local/` or 332 | `/var/lib/mysql/`), the installer _must_ register these in `/etc/fstab`. The 333 | `root=` parameter passed to the kernel by the boot loader may be omitted if the 334 | root partition is the first one on the disk of its type. If the root partition 335 | is not the first one on the disk, the `root=` parameter _must_ be passed to the 336 | kernel by the boot loader. An installer that mounts a root, `/usr/`, `/home/`, 337 | `/srv/`, `/var/`, or `/var/tmp/` file system with the partition types defined 338 | as above which contains a LUKS header _must_ call the device mapper device 339 | "root", "usr", "home", "srv", "var" or "tmp", respectively. This is necessary 340 | to ensure that the automatic discovery will never result in different device 341 | mapper names than any static configuration by the installer, thus eliminating 342 | possible naming conflicts and ambiguities.
343 | 344 | An *operating* *system* _should_ automatically discover and mount the first 345 | root partition that does not have the no-auto flag set (as described above) by 346 | scanning the disk containing the currently used EFI ESP. It _should_ 347 | automatically discover and mount the first `/usr/`, `/home/`, `/srv/`, `/var/`, 348 | `/var/tmp/` and swap partitions that do not have the no-auto flag set by 349 | scanning the disk containing the discovered root partition. It should 350 | automatically discover and mount the partition containing the currently used 351 | EFI ESP to `/efi/` (or `/boot/` as fallback). It should automatically discover 352 | and mount the partition containing the currently used Extended Boot Loader 353 | Partition to `/boot/`. It _should not_ discover or automatically mount 354 | partitions with other UUID partition types, or partitions located on other 355 | disks, or partitions with the no-auto flag set. User configuration shall 356 | always override automatic discovery and mounting. If a root, `/usr/`, 357 | `/home/`, `/srv/`, `/boot/`, `/var/`, `/var/tmp/`, `/efi/`, `/boot/` or swap 358 | partition is listed in `/etc/fstab` or with `root=` on the kernel command line, 359 | it _must_ take precedence over automatically discovered partitions. If a 360 | `/home/`, `/usr/`, `/srv/`, `/boot/`, `/var/`, `/var/tmp/`, `/efi/` or `/boot/` 361 | directory is found to be populated already in the root partition, the automatic 362 | discovery _must not_ mount any discovered file system over it. Optionally, in 363 | case of the root, `/usr/` and their Verity partitions instead of strictly 364 | mounting the first suitable partition an OS might choose to mount the partition 365 | whose label compares the highest according to `strverscmp()` or similar logic, 366 | in order to implement a simple partition-based A/B versioning scheme. 
The 367 | precise rules are left for the implementation to decide, but when in doubt 368 | earlier partitions (by their index) should always win over later partitions if 369 | the label comparison is inconclusive. 370 | 371 | A *container* *manager* should automatically discover and mount the root, 372 | `/usr/`, `/home/`, `/srv/`, `/var/`, `/var/tmp/` partitions inside a container 373 | disk image. It may choose to mount any discovered ESP and/or XBOOTLDR 374 | partition to `/efi/` or `/boot/`. It should ignore any swap partitions should they be 375 | included in a container disk image. 376 | 377 | If a btrfs file system is automatically discovered and mounted by the operating 378 | system/container manager it will be mounted with its *default* subvolume. The 379 | installer should make sure to set the default subvolume correctly using "btrfs 380 | subvolume set-default". 381 | 382 | ## Sharing of File Systems between Installations 383 | 384 | If two Linux-based operating systems are installed on the same disk, the scheme 385 | above suggests that they may share the swap, `/home/`, `/srv/`, `/var/tmp/`, 386 | ESP and XBOOTLDR partitions. However, they should each have their own root, `/usr/` and 387 | `/var/` partition. 388 | 389 | ## Frequently Asked Questions 390 | 391 | ### Why are you taking my `/etc/fstab` away? 392 | 393 | We are not. `/etc/fstab` always overrides automatic discovery and is indeed 394 | mentioned in the specifications. We are simply trying to make the boot and 395 | installation processes of Linux a bit more robust and self-descriptive. 396 | 397 | ### Why did you only define the root partition for these listed architectures? 398 | 399 | Please submit a patch that adds appropriate partition type UUIDs for the 400 | architecture of your choice should they be missing so far. The only reason they 401 | aren't defined yet is that nobody submitted them yet. 402 | 403 | ### Why define distinct root partition UUIDs for the various architectures?
404 | 405 | This allows disk images that may be booted on multiple architectures to use 406 | discovery of the appropriate root partition on each architecture. 407 | 408 | ### Doesn't this break multi-boot scenarios? 409 | 410 | No, it doesn't. The specification says that installers may not stop creating 411 | `/etc/fstab` or stop including `root=` on the kernel command line, unless the used 412 | partitions are the first ones of their type on the disk. Additionally, 413 | `/etc/fstab` and `root=` both override automatic discovery. Multi-boot is hence 414 | well supported, since it doesn't change anything for anything but the first 415 | installation. 416 | 417 | That all said, it's not expected that generic installers generally stop setting 418 | `root=` and creating `/etc/fstab` anyway. The option to drop these configuration 419 | bits is primarily something for appliance-like devices. However, generic 420 | installers should *still* set the right GPT partition types for the partitions 421 | they create so that container managers, partition tools and administrators can 422 | benefit. Phrased differently, this specification introduces A) the 423 | *recommendation* to use the newly defined partition types to tag things 424 | properly and B) the *option* to then drop `root=` and `/etc/fstab`. While we 425 | advertise A) to *all* installers, we only propose B) for simpler, 426 | appliance-like installations. 427 | 428 | ### What partitioning tools will create a DPS-compliant partition table? 429 | 430 | As of util-linux 2.25.2, the `fdisk` tool provides type codes to create the 431 | root, home, and swap partitions that the DPS expects. By default, `fdisk` will 432 | create an old-style MBR, not a GPT, so typing `l` to list partition types will 433 | not show the choices to let you set the correct UUID. Make sure to first create 434 | an empty GPT, then type `l` in order for the DPS-compliant type codes to be 435 | available. 
436 | 437 | The `gdisk` tool (from version 1.0.5 onward) and its variants (`sgdisk`, 438 | `cgdisk`) also support creation of partitions with a matching type code. 439 | 440 | ## Links 441 | 442 | [Boot Loader Specification](boot_loader_specification.md)
443 | [Boot Loader Interface](https://systemd.io/BOOT_LOADER_INTERFACE)
444 | [Safely Building Images](https://systemd.io/BUILDING_IMAGES)
445 | [`systemd-boot(7)`](https://www.freedesktop.org/software/systemd/man/systemd-boot.html)
446 | [`bootctl(1)`](https://www.freedesktop.org/software/systemd/man/bootctl.html)
447 | [`systemd-gpt-auto-generator(8)`](https://www.freedesktop.org/software/systemd/man/systemd-gpt-auto-generator.html) 448 | --------------------------------------------------------------------------------