├── .github
└── workflows
│ ├── build-kernel-debs-7.1.yml
│ ├── build-kernel-debs.yml
│ ├── docker-publish.yml
│ └── docker-test.yml
├── .gitignore
├── Dockerfile
├── README.md
├── build
└── proxmox
│ ├── README.md
│ ├── build.sh
│ ├── build7.1-10.sh
│ ├── build7.sh
│ ├── build_latest.sh
│ └── docker-compose.yaml
├── deep-dive.md
└── patches
├── add-relaxable-rmrr-5_11.patch
├── add-relaxable-rmrr-5_13.patch
├── add-relaxable-rmrr-5_15.patch
├── add-relaxable-rmrr-5_8_and_up.patch
├── add-relaxable-rmrr-below-5_8.patch
├── proxmox.patch
├── proxmox7.patch
└── relaxable-rmrr-patch-sed.txt
/.github/workflows/build-kernel-debs-7.1.yml:
--------------------------------------------------------------------------------
1 | name: Build kernel debs v7.1
2 | # Builds the patched PVE kernel debs with build7.1-10.sh, on demand and weekly (Saturdays 04:00 UTC).
3 | on:
4 | workflow_dispatch:
5 | schedule:
6 | # * is a special character in YAML so you have to quote this string
7 | - cron: '0 4 * * 6'
8 | jobs:
9 | build-kernel-debs:
10 | runs-on: [pve-kernel]
11 | container:
12 | image: aterfax/relaxable-rmrr-proxmox-kernel-builder:latest
13 | options: -v ${{ github.workspace }}:/build/proxmox/proxmox-kernel #Note this is technically a very bad idea if your Runner is doing more than this sole action due to environment pollution.
14 |
15 | steps:
16 | # No checkout step resets the bind-mounted workspace, so leftovers of an earlier (possibly failed) run can survive.
17 | - name: Pre-clean up debs and stale release zip if present
18 | run: rm -rf /build/proxmox/proxmox-kernel/debs release.zip
19 |
20 | - name: Build kernel
21 | run: cd /build/proxmox/ && ./build7.1-10.sh
22 | # zip updates an existing archive in place, which is why a stale release.zip is removed in the pre-clean step.
23 | - name: Zip up debs
24 | run: zip -r release.zip /build/proxmox/proxmox-kernel/debs
25 | # NOTE(review): v3 of actions/upload-artifact has been deprecated by GitHub - confirm whether a bump to v4 is needed.
26 | - name: Archive the generated debs
27 | uses: actions/upload-artifact@v3
28 | with:
29 | name: RMRR-Relaxation-Patched-PVE-kernel-debs-zip
30 | path: release.zip
31 |
32 | - name: Calculate release zip checksum
33 | run: bash -c 'sha256sum release.zip && md5sum release.zip'
34 |
35 | - name: Clean up release zip
36 | run: rm release.zip
37 | # Same absolute path as the pre-clean step, so both steps always target the same directory.
38 | - name: Clean up debs if present
39 | run: rm -rf /build/proxmox/proxmox-kernel/debs
40 |
--------------------------------------------------------------------------------
/.github/workflows/build-kernel-debs.yml:
--------------------------------------------------------------------------------
1 | name: Build kernel debs
2 | # Builds the patched PVE kernel debs with build_latest.sh, on demand and weekly (Saturdays 02:00 UTC).
3 | on:
4 | workflow_dispatch:
5 | schedule:
6 | # * is a special character in YAML so you have to quote this string
7 | - cron: '0 2 * * 6'
8 | jobs:
9 | build-kernel-debs:
10 | runs-on: [pve-kernel]
11 | container:
12 | image: aterfax/relaxable-rmrr-proxmox-kernel-builder:latest
13 | options: -v ${{ github.workspace }}:/build/proxmox/proxmox-kernel #Note this is technically a very bad idea if your Runner is doing more than this sole action due to environment pollution.
14 |
15 | steps:
16 | # No checkout step resets the bind-mounted workspace, so leftovers of an earlier (possibly failed) run can survive.
17 | - name: Pre-clean up debs and stale release zip if present
18 | run: rm -rf /build/proxmox/proxmox-kernel/debs release.zip
19 |
20 | - name: Build kernel
21 | run: cd /build/proxmox/ && ./build_latest.sh
22 | # zip updates an existing archive in place, which is why a stale release.zip is removed in the pre-clean step.
23 | - name: Zip up debs
24 | run: zip -r release.zip /build/proxmox/proxmox-kernel/debs
25 | # NOTE(review): v3 of actions/upload-artifact has been deprecated by GitHub - confirm whether a bump to v4 is needed.
26 | - name: Archive the generated debs
27 | uses: actions/upload-artifact@v3
28 | with:
29 | name: RMRR-Relaxation-Patched-PVE-kernel-debs-zip
30 | path: release.zip
31 |
32 | - name: Calculate release zip checksum
33 | run: bash -c 'sha256sum release.zip && md5sum release.zip'
34 |
35 | - name: Clean up release zip
36 | run: rm release.zip
37 | # Same absolute path as the pre-clean step, so both steps always target the same directory.
38 | - name: Clean up debs if present
39 | run: rm -rf /build/proxmox/proxmox-kernel/debs
40 |
--------------------------------------------------------------------------------
/.github/workflows/docker-publish.yml:
--------------------------------------------------------------------------------
1 | name: Build and Publish Docker Image
2 | # Builds the kernel-builder image and pushes it to Docker Hub and GHCR when the Dockerfile changes on master, or on demand.
3 | on:
4 | push:
5 | branches: [master]
6 | paths:
7 | # NOTE(review): the Dockerfile also COPYs build/ and patches/ into the image; changes there do not trigger this
8 | # workflow - confirm that is intended.
9 | - 'Dockerfile'
10 | workflow_dispatch:
11 |
12 | jobs:
13 | build-and-push-docker-image:
14 | name: Build Docker image and push to repositories
15 | runs-on: ubuntu-latest
16 | steps:
17 |
18 | - name: Checkout code
19 | uses: actions/checkout@v2
20 |
21 | - name: Set up Docker Buildx
22 | id: buildx
23 | uses: docker/setup-buildx-action@v1
24 |
25 | - name: Login to DockerHub
26 | uses: docker/login-action@v1
27 | with:
28 | username: ${{ secrets.DOCKERHUB_USERNAME }}
29 | password: ${{ secrets.DOCKERHUB_TOKEN }}
30 |
31 | - name: Login to Github Packages
32 | uses: docker/login-action@v1
33 | with:
34 | registry: ghcr.io
35 | username: ${{ github.actor }}
36 | password: ${{ secrets.GHCR_PAT }}
37 |
38 | - name: Build image and push to Docker Hub and GitHub Container Registry
39 | # The id is what lets the "Image digest" step below read this step's outputs via steps.docker_build.
40 | id: docker_build
41 | uses: docker/build-push-action@v2
42 | with:
43 | context: .
44 | push: true
45 | tags: |
46 | aterfax/relaxable-rmrr-proxmox-kernel-builder:latest
47 | ghcr.io/aterfax/relaxable-rmrr-proxmox-kernel-builder:latest
48 |
49 | - name: Image digest
50 | run: echo ${{ steps.docker_build.outputs.digest }}
51 |
--------------------------------------------------------------------------------
/.github/workflows/docker-test.yml:
--------------------------------------------------------------------------------
1 | name: Build Docker Image
2 |
3 | on:
4 | # run it during pull request
5 | pull_request:
6 | paths:
7 | - 'Dockerfile'
8 |
9 | workflow_dispatch:
10 |
11 | jobs:
12 | # define job to build the docker image without publishing it
13 | build-docker-image:
14 | name: Build Docker image only
15 | # the image is only built here (push: false below), never pushed
16 | runs-on: ubuntu-latest
17 |
18 | # steps to perform in job
19 | steps:
20 | - name: Checkout code
21 | uses: actions/checkout@v2
22 |
23 | # setup Docker build action
24 | - name: Set up Docker Buildx
25 | id: buildx
26 | uses: docker/setup-buildx-action@v1
27 |
28 | - name: Build image only
29 | # The id is what lets the "Image digest" step below read this step's outputs via steps.docker_build.
30 | id: docker_build
31 | uses: docker/build-push-action@v2
32 | with:
33 | context: .
34 | push: false
35 |
36 | - name: Image digest
37 | run: echo ${{ steps.docker_build.outputs.digest }}
38 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | proxmox-kernel/
2 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | # vim:set ft=dockerfile:
2 |
3 | # This Dockerfile builds the newest kernel with RMRR patch
4 | #
5 | # TODO Add support for custom branch of build
6 | FROM debian:bullseye
7 |
8 | RUN mkdir -p /build
9 | WORKDIR /build
10 |
11 | RUN set -x \
12 | && apt update && apt install -y ca-certificates wget
13 |
14 | # apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 7BF2812E8A6E88E0
15 | RUN apt -y install gnupg && wget https://enterprise.proxmox.com/debian/proxmox-release-bullseye.gpg -O /etc/apt/trusted.gpg.d/proxmox-release-bullseye.gpg && \
16 | echo 'deb http://download.proxmox.com/debian/pve bullseye pve-no-subscription' > /etc/apt/sources.list.d/pve.list
17 |
18 | # Refresh the package lists (the Proxmox repository was added above) and install the build dependencies in the SAME
19 | # layer, so a cached "update" layer holding stale lists can never be combined with a fresh "install".
20 | RUN apt -y update && apt -y install git nano screen patch fakeroot build-essential devscripts libncurses5 libncurses5-dev libssl-dev bc \
21 | flex bison libelf-dev libaudit-dev libgtk2.0-dev libperl-dev asciidoc xmlto gnupg gnupg2 rsync lintian debhelper \
22 | libdw-dev libnuma-dev libslang2-dev sphinx-common asciidoc-base automake cpio dh-python file gcc kmod libiberty-dev \
23 | libpve-common-perl libtool perl-modules python3-minimal python3-dev sed tar zlib1g-dev lz4 curl zstd dwarves
24 |
25 | #Need pahole 1.16 or above
26 | RUN TEMP_DEB="$(mktemp)" && \
27 | wget -O "$TEMP_DEB" http://archive.ubuntu.com/ubuntu/pool/universe/d/dwarves-dfsg/dwarves_1.21-0ubuntu1~20.04.1_amd64.deb && \
28 | dpkg -i "$TEMP_DEB" && \
29 | rm -f "$TEMP_DEB"
30 |
31 | # Copy both folders into docker root filepath.
32 | COPY build /build
33 | COPY patches /patches
34 | # Default action: build the latest kernel. Override the entrypoint to run one of the other build scripts.
35 | #ENTRYPOINT ["tail", "-f", "/dev/null"]
36 | ENTRYPOINT bash -c "cd /build/proxmox/ && ./build_latest.sh"
37 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 🍻 Relaxed RMRR Mapping for Linux 3.17+ - ARCHIVED
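2 | [![Build and Publish Docker Image](https://github.com/Aterfax/relax-intel-rmrr/actions/workflows/docker-publish.yml/badge.svg)](https://github.com/Aterfax/relax-intel-rmrr/actions/workflows/docker-publish.yml)
3 | [![Build kernel debs](https://github.com/Aterfax/relax-intel-rmrr/actions/workflows/build-kernel-debs.yml/badge.svg)](https://github.com/Aterfax/relax-intel-rmrr/actions/workflows/build-kernel-debs.yml)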
4 | 
5 | 
6 | > :warning: Note - this repo is now archived, as Relaxed RMRR Mapping is natively supported by the normal Proxmox kernel as of kernel release 6.2.16-13-pve. See: https://bugzilla.proxmox.com/show_bug.cgi?id=4707 and https://forum.proxmox.com/threads/updating-upgrading-custom-patched-kernel.129384/#post-591947
7 |
8 | This fork has been amended to patch the required iommu source files using ``sed`` rather than ``patch``. This is achieved by using ``sed`` to amend the pve-kernel **Makefile** using several further ``sed`` commands to edit the iommu source file during the make process as this make process pulls the source files (chicken/egg problem.)
9 |
10 | The key ``sed`` commands can be found at:
11 |
12 | - [relax-intel-rmrr/patches/relaxable-rmrr-patch-sed.txt](patches/relaxable-rmrr-patch-sed.txt)
13 | - [relax-intel-rmrr/build/proxmox/build.sh#L157](build/proxmox/build.sh#L157)
14 |
15 |
16 | ## 🐧💨 Now you can use PCI passthrough on broken platforms
17 |
18 | ### TL;DR
19 | When you try to use PCI/PCIe passthrough in KVM/QEMU/Proxmox you get:
20 | ```
21 | vfio-pci 0000:01:00.1: Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.
22 | ```
23 | followed by `vfio: failed to set iommu for container: Operation not permitted`.
24 |
25 | This kernel patch fixes the problem **on kernels v3.17 and up** (tested up to 5.9.1). You can skip to "[Installation](README.md#installation)"
26 | section if you don't care about the rest. Reading of "[Disclaimers](README.md#disclaimers)" section to understand the
27 | risks, and "[Solutions & hacks](deep-dive.md#other-solutions--hacks)" to get the idea of different alternatives is
28 | highly recommended.
29 |
30 | ---
31 |
32 | ### Table of Contents
33 | 1. [Installation](README.md#installation)
34 | - [Proxmox - premade packages](README.md#proxmox---premade-packages-easy)
35 | - [Docker - building from sources](README.md#docker---build-packages-from-sources-intermediate)
36 | - [Proxmox - building from sources](README.md#proxmox---building-from-sources-advanced)
37 | - [Other distros](README.md#other-distros)
38 | 2. [Configuration](README.md#configuration)
39 | 3. [Deep Dive](deep-dive.md) - *thorough research on the problem, written for mortals*
40 | - [Technical details](deep-dive.md#technical-details)
41 | - [How virtual machines use memory?](deep-dive.md#how-virtual-machines-use-memory)
42 | - [Why do we need VT-d / AMD-Vi?](deep-dive.md#why-do-we-need-vt-d--amd-vi)
43 | - [How PCI/PCIe actually work?](deep-dive.md#how-pcipcie-actually-work)
44 | - [RMRR - the monster in a closet](deep-dive.md#rmrr---the-monster-in-a-closet)
45 | - [What vendors did wrong?](deep-dive.md#what-vendors-did-wrong)
46 | - [Other solutions & hacks](deep-dive.md#other-solutions--hacks)
47 | - [Contact your platform vendor](deep-dive.md#contact-your-platform-vendor)
48 | - [Use OS which ignores RMRRs](deep-dive.md#use-os-which-ignores-rmrrs)
49 | - [Attempt HPE's pseudofix (if you use HP)](deep-dive.md#attempt-hpes-pseudofix-if-you-use-hp)
50 | - [The comment-the-error-out hack (v3.17 - 5.3)](deep-dive.md#the-comment-the-error-out-hack-v317---53)
51 | - [Long-term solution - utilizing relaxable reservation regions (>=3.17)](deep-dive.md#long-term-solution---utilizing-relaxable-reservation-regions-317)
52 | - [Why commenting-out the error is a bad idea](deep-dive.md#why-commenting-out-the-error-is-a-bad-idea)
53 | - [The kernel moves on quickly](deep-dive.md#the-kernel-moves-on-quickly)
54 | - [What this patch actually does](deep-dive.md#what-this-patch-actually-does)
55 | - [Why kernel patch and not a loadable module?](deep-dive.md#why-kernel-patch-and-not-a-loadable-module)
56 | - [The future](deep-dive.md#the-future)
57 | 4. [Disclaimers](README.md#disclaimers)
58 | 5. [Acknowledgments & References](README.md#acknowledgments--references)
59 | 6. [License](README.md#license)
60 |
61 | ---
62 |
63 | ### Installation
64 |
65 | #### Proxmox - premade packages (easy)
66 | As I believe in *[eating your own dog food](https://en.wikipedia.org/wiki/Eating_your_own_dog_food)* I run the kernel
67 | described here. Thus, I publish precompiled packages.
68 |
69 | 1. Go to the [releases tab](https://github.com/Aterfax/relax-intel-rmrr/releases) and pick appropriate packages
70 | 2. Download `release.zip`, unzip it and `cd` down to the bottom of the directory tree. (You can copy links and use `wget https://...` and `unzip release.zip` on the server itself)
71 | 3. *(OPTIONAL)* Verify the release signature on ``release.zip`` as discussed here: https://github.com/Aterfax/relax-intel-rmrr/discussions/16
72 | 4. Install all using `dpkg -i *.deb` in the folder where you downloaded the debs
73 | 5. *(OPTIONAL)* Verify the kernel works with the patch disabled by rebooting and checking if `uname -r` shows a version
74 | ending with `-pve-relaxablermrr`
75 | 6. [Configure the kernel](README.md#configuration)
76 |
77 | ---
78 | #### Docker - build packages from sources (intermediate)
79 |
80 | #### Prerequisites
81 | 1. Docker installed (tested on Ubuntu 22.04 & Debian 10).
82 | 2. ~40GB of free space.
83 | 3. Git clone of this repo (if building the image yourself.)
84 |
85 | #### Steps
86 |
87 | 1. (Optional) Build the container image yourself from the top level of the cloned repo (Dockerfile will be present):
88 |
89 | `docker build -t relaxable-rmrr-proxmox-kernel-builder .`
90 |
91 | 2. Run the Docker image with an appropriate host file system binding (you can just pull the image direct from DockerHub, adjust the command below to the correct image name if you are building yourself):
92 |
93 | `docker run --name relaxable-rmrr-proxmox-kernel-builder -v /mnt/scratch/proxmox-kernel-build-area/proxmox-kernel:/build/proxmox/proxmox-kernel -it aterfax/relaxable-rmrr-proxmox-kernel-builder:latest`
94 |
95 | 3. Wait until the build finishes (30 - 300 minutes depending on hardware used) and find the debs on your host file system path e.g.
96 |
97 | `/mnt/scratch/proxmox-kernel-build-area/proxmox-kernel/debs`
98 |
99 | 4. Now you can [install debs like you would premade packages](README.md#proxmox---premade-packages-easy).
100 |
101 | 5. [Configure the kernel](README.md#configuration)
102 |
103 | Note: If you want to build specific versions you can override the entrypoint from `bash -c "cd /build/proxmox/ && ./build_latest.sh"` to a script version of your choosing e.g. `bash -c "cd /build/proxmox/ && ./build7.1-10.sh"`
104 |
105 | 6. Navigate to your `proxmox-kernel` directory and remove the build files to save space (if desired.)
106 |
107 | ---
108 |
109 | #### Proxmox - building from sources (advanced)
110 | If you're running a version of Proxmox with [no packages available](README.md#proxmox---premade-packages-easy) you can
111 | [compile the kernel yourself using patches provided](build/proxmox/).
112 |
113 | ---
114 |
115 | #### Other distros
116 | 1. Download kernel sources appropriate for your distribution
117 | 2. Apply an appropriate patch to the source tree
118 | - Go to the folder with your kernel source
119 | - For Linux 3.17 - 5.7: `patch -p1 < ../patches/add-relaxable-rmrr-below-5_8.patch`
120 | - For Linux >=5.8: `patch -p1 < ../patches/add-relaxable-rmrr-5_8_and_up.patch`
121 | 3. Follow your distro kernel compilation & installation instruction:
122 | - [Debian](https://wiki.debian.org/BuildADebianKernelPackage)
123 | - [Ubuntu](https://wiki.ubuntu.com/Kernel/BuildYourOwnKernel)
124 |
125 | ---
126 |
127 | ### Configuration
128 | By default, after the kernel is installed, the patch will be *inactive* (i.e. the kernel will behave like this patch was
129 | never applied). To activate it you have to add `intel_iommu=relax_rmrr` to your Linux boot args.
130 |
131 | In most distros (including Proxmox) you do this by:
132 | 1. Opening `/etc/default/grub` (e.g. using `nano /etc/default/grub`)
133 | 2. Editing the `GRUB_CMDLINE_LINUX_DEFAULT` to include the option:
134 | - Example of old line:
135 | ```
136 | GRUB_CMDLINE_LINUX_DEFAULT="quiet intel_iommu=on iommu=pt intremap=no_x2apic_optout"
137 | ```
138 | - Example of new line:
139 | ```
140 | GRUB_CMDLINE_LINUX_DEFAULT="quiet intel_iommu=on,relax_rmrr iommu=pt intremap=no_x2apic_optout"
141 | ```
142 | - *Side note: these are actually options which will make your PCI passthrough work and do so efficiently*
143 | 3. If not running Proxmox, running the `update-grub` command and consulting your Linux distro's instructions on kernel pinning to pin your chosen kernel.
144 | 4. If using Proxmox, running the `proxmox-boot-tool kernel list` command to list your available kernels and then 'pinning' your chosen version with the `proxmox-boot-tool` e.g. `proxmox-boot-tool kernel pin 6.2.11-1-pve-relaxablermrr`
145 | 5. Making sure to take a note or making a calendar event to keep updating your kernels and repinning new releases!
146 | 6. Rebooting
147 |
148 | To verify that the patch is active, execute `dmesg | grep 'Intel-IOMMU'` after reboot. You should see a result similar
149 | to this:
150 |
151 | ```
152 | root@sandbox:~# dmesg | grep 'Intel-IOMMU'
153 | [ 0.050195] DMAR: Intel-IOMMU: assuming all RMRRs are relaxable. This can lead to instability or data loss
154 | root@sandbox:~#
155 | ```
156 |
157 | ---
158 |
159 | ### Disclaimers
160 | - I'm not a kernel programmer by any means, so if I got something horribly wrong correct me please :)
161 | - This patch should be safe, as long as you don't try to remap devices which are used by the IPMI/BIOS, e.g.
162 | - Network port shared between your IPMI and OS
163 | - RAID card in non-HBA mode with its driver loaded on the host
164 | - Network card with monitoring system installed on the host (e.g. [Intel Active Health System Agent](https://support.hpe.com/hpesc/public/docDisplay?docId=emr_na-c04781229))
165 | - This is not a supported solution by any of the vendors. In fact this is a direct violation of Intel's VT-d specs
166 | (which Linux already violates anyway, but this is increasing the scope). It may cause crashes or major instabilities.
167 | You've been warned.
168 |
169 | ---
170 |
171 | ### Acknowledgments & References
172 | - [Comment-out hack research by dschense](https://forum.proxmox.com/threads/hp-proliant-microserver-gen8-raidcontroller-hp-p410-passthrough-probleme.30547/post-155675)
173 | - [Proxmox kernel compilation & patching by Feni](https://forum.proxmox.com/threads/compile-proxmox-ve-with-patched-intel-iommu-driver-to-remove-rmrr-check.36374/)
174 | - [Linux IOMMU Support](https://www.kernel.org/doc/html/latest/x86/intel-iommu.html)
175 | - [RedHat RMRR EXCLUSION Whitepaper](https://access.redhat.com/sites/default/files/attachments/rmrr-wp1.pdf)
176 | - [Intel® Virtualization Technology for Directed I/O (VT-d)](https://software.intel.com/content/www/us/en/develop/articles/intel-virtualization-technology-for-directed-io-vt-d-enhancing-intel-platforms-for-efficient-virtualization-of-io-devices.html)
177 | - [Intel® Virtualization Technology for Directed I/O Architecture Specification](https://software.intel.com/content/www/us/en/develop/download/intel-virtualization-technology-for-directed-io-architecture-specification.html)
178 |
179 | ---
180 |
181 | ### License
182 | This work (patches & docs) is dual-licensed under MIT and GPL 2.0 (or any later version), which should be treated as an
183 | equivalent of Linux `Dual MIT/GPL` (i.e. pick a license you prefer).
184 |
--------------------------------------------------------------------------------
/build/proxmox/README.md:
--------------------------------------------------------------------------------
1 | ## Proxmox - building from sources
2 |
3 | If you're running a version of Proxmox with [no packages available](../../README.md#proxmox---premade-packages-easy), or
4 | for some reason you don't/can't trust precompiled packages you can compile the kernel yourself using patches provided.
5 |
6 | The easiest way to do it is to clone this repository and use the build script provided, alongside this `README.md` file
7 | ([`build/proxmox/build_latest.sh`](build_latest.sh))
8 |
9 |
10 | ### How to do it WITHOUT Docker?
11 | This is mostly intended if you want to build & run on your Proxmox host. Jump to [Docker-ized](README.md#how-to-do-it-with-docker)
12 | guide if you want to build packages in an isolated environment.
13 |
14 | #### Prerequisites
15 | 1. Proxmox 6/7 install (recommended) or Debian Buster/Bullseye *(it WILL fail on Ubuntu!)*
16 | 2. Root access.
17 | 3. ~40GB of free space.
18 |
19 | #### Steps
20 | 1. Clone the repo and `cd` to the `build/proxmox/` directory.
21 | 2. Run the [`build_latest.sh`](build_latest.sh) script from terminal:
22 | `RMRR_AUTOINSTALL=1 bash ./build_latest.sh`
23 | *You can also manually execute commands in the script step-by-step. To facilitate that the script contains
24 | extensive comments for every step.*
25 |
26 | 3. *(OPTIONAL)* Verify the kernel works with the patch disabled by rebooting and checking if `uname -r` shows a version
27 | ending with `-pve-relaxablermrr`
28 | 4. [Configure the kernel](../../README.md#configuration)
29 | 5. Navigate to your `proxmox-kernel` directory and remove the build files to save space (if desired.)
30 |
31 | This process will also leave precompiled `*.deb` packages, in case you want to copy them to other Proxmox hosts you have.
32 |
33 | ---
34 |
35 | ### How to do it WITH Docker?
36 | This is mostly intended for building packages for later use (and/or when you don't want to mess with your OS).
37 |
38 | #### Prerequisites
39 | 1. Docker installed (tested on Ubuntu 22.04 & Debian 10).
40 | 2. ~40GB of free space.
41 | 3. Git clone of this repo (if building the image yourself.)
42 |
43 | #### Steps
44 |
45 | 1. (Optional) Build the container image yourself from the top level of the cloned repo (Dockerfile will be present):
46 |
47 | `docker build -t relaxable-rmrr-proxmox-kernel-builder .`
48 |
49 | 2. Run the Docker image with an appropriate host file system binding (you can just pull the image direct from DockerHub, adjust the command below to the correct image name if you are building yourself):
50 |
51 | `docker run --name relaxable-rmrr-proxmox-kernel-builder -v /mnt/scratch/proxmox-kernel-build-area/proxmox-kernel:/build/proxmox/proxmox-kernel -it aterfax/relaxable-rmrr-proxmox-kernel-builder:latest`
52 |
53 | 3. Wait until the build finishes (30 - 300 minutes depending on hardware used) and find the debs on your host file system path e.g.
54 |
55 | `/mnt/scratch/proxmox-kernel-build-area/proxmox-kernel/debs`
56 |
57 | 4. Now you can [install debs like you would premade packages](../../README.md#proxmox---premade-packages-easy).
58 |
59 | 5. [Configure the kernel](../../README.md#configuration)
60 |
61 | Note: If you want to build specific versions you can override the entrypoint from `bash -c "cd /build/proxmox/ && ./build_latest.sh"` to a script version of your choosing e.g. `bash -c "cd /build/proxmox/ && ./build7.1-10.sh"`
62 |
63 | 6. Navigate to your `proxmox-kernel` directory and remove the build files to save space (if desired.)
64 |
65 |
66 |
--------------------------------------------------------------------------------
/build/proxmox/build.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -e
3 | # Build settings: each default below applies only when the variable is unset or empty, so the environment can override it.
4 | : "${PVE_KERNEL_BRANCH:=master}"
5 | : "${RELAX_INTEL_GIT_REPO:=https://github.com/kiler129/relax-intel-rmrr.git}"
6 | : "${PROXMOX_PATCH:=proxmox.patch}"
7 | : "${RELAX_PATCH:=proxmox.patch}"
8 | # NOTE: RELAX_PATCH is only echoed in this script; its one other use (a cp in step 3) is commented out.
9 | echo '###########################################################'
10 | echo '################ Settings ################################'
11 | echo '###########################################################'
12 |
13 | echo "PVE_KERNEL_BRANCH:${PVE_KERNEL_BRANCH}"
14 | echo "RELAX_INTEL_GIT_REPO:${RELAX_INTEL_GIT_REPO}"
15 | echo "PROXMOX_PATCH:${PROXMOX_PATCH}"
16 | echo "RELAX_PATCH:${RELAX_PATCH}"
17 |
18 |
19 | #################################################################################
20 | # This script is a part of https://github.com/kiler129/relax-intel-rmrr project #
21 | #################################################################################
22 |
23 |
24 | echo '###########################################################'
25 | echo '############# STEP 0 - PERFORM SANITY CHECKS ##############'
26 | echo '###########################################################'
27 | # Make sure script is working in the directory it is located in
28 | cd "$(dirname "$(readlink -f "$0")")"
29 | SCRIPT_DIR=$(pwd)
30 |
31 |
32 | # Build process will fail if you're not a root (+ apt actions itself need it)
33 | if [[ "$EUID" -ne 0 ]]
34 | then echo "This script should be run by root"
35 | exit 1
36 | fi
37 |
38 | # Sanity check: make sure no two builds are started nor we have something leftover from previous attempts
39 | if [[ -f "$SCRIPT_DIR/script_running" ]]; then
40 | echo "This script already appears to be running or has not cleaned up correctly."
41 | echo "To continue please remove $SCRIPT_DIR/script_running if you are sure a script is not already running."
42 | exit 1
43 | fi
44 |
45 | # Set the lockfile and make sure it is removed however the script exits (including a "set -e" abort), so a failed
46 | # build does not block every later run until the file is deleted by hand.
47 | touch "$SCRIPT_DIR/script_running"
48 | trap 'rm -f "$SCRIPT_DIR/script_running"' EXIT
49 | if [[ -d "proxmox-kernel" ]]; then
50 | echo 'Directory "proxmox-kernel" already exists.'
51 | cd proxmox-kernel
52 |
53 | echo "Cleaning debs previous dir if present."
54 | if [[ -d "debs" ]]; then rm -rf debs; fi
55 |
56 | if [[ -d "pve-kernel" ]]; then
57 | echo 'Directory "pve-kernel" already exists - resetting cloned Git repositories.'
58 | cd pve-kernel
59 | git clean -xfd
60 | git submodule foreach --recursive git clean -xfd
61 | git reset --hard
62 | # Fetch and switch to the requested branch BEFORE pulling: pulling first only updates the branch the previous run
63 | # left checked out, leaving the requested one stale. The pull is skipped on a detached HEAD (nothing to pull into).
64 | git fetch
65 | git checkout "${PVE_KERNEL_BRANCH}"
66 | if git symbolic-ref -q HEAD >/dev/null; then git pull; fi
67 | git submodule foreach --recursive git reset --hard
68 | git submodule update --init --recursive
69 | PVE_KERNEL_GIT_DIR_PRESENT=1
70 | cd ..
71 | fi
72 |
73 | if [[ -d "relax-intel-rmrr" ]]; then
74 | cd relax-intel-rmrr
75 | git reset --hard
76 | RELAX_INTEL_RMRR_GIT_DIR_PRESENT=1
77 | cd ..
78 | fi
79 | cd "$SCRIPT_DIR"
80 | fi
81 |
82 | echo '###########################################################'
83 | echo '############ STEP 1 - INSTALL ALL DEPENDENCIES ############'
84 | echo '###########################################################'
85 | # Check if Proxmox-specific package exists in apt cache. If it does it means apt already knows Proxmox repository, if
86 | # not we need to add it to properly build the kernel
87 | if apt show libpve-common-perl &>/dev/null; then
88 | echo "Step 1.0: Proxmox repository already present - not adding"
89 | else
90 | # Add Proxmox repo & their signing key (both are hard-coded for the Debian bullseye release)
91 | echo "Step 1.0: Adding Proxmox apt repository..."
92 | apt -y update
93 | apt -y install gnupg
94 | # apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 7BF2812E8A6E88E0
95 | wget https://enterprise.proxmox.com/debian/proxmox-release-bullseye.gpg -O /etc/apt/trusted.gpg.d/proxmox-release-bullseye.gpg
96 | echo 'deb http://download.proxmox.com/debian/pve bullseye pve-no-subscription' > /etc/apt/sources.list.d/pve.list
97 | fi
98 |
99 | # Install all packages required to build the kernel & create *.deb packages for installation
100 | echo "Step 1.1: Installing build dependencies..."
101 | apt -y update
102 | apt -y install git nano screen patch fakeroot build-essential devscripts libncurses5 libncurses5-dev libssl-dev bc \
103 | flex bison libelf-dev libaudit-dev libgtk2.0-dev libperl-dev asciidoc xmlto gnupg gnupg2 rsync lintian debhelper \
104 | libdw-dev libnuma-dev libslang2-dev sphinx-common asciidoc-base automake cpio dh-python file gcc kmod libiberty-dev \
105 | libpve-common-perl libtool perl-modules python3-minimal python3-dev sed tar zlib1g-dev lz4 curl zstd dwarves
106 |
107 |
108 |
109 | echo '###########################################################'
110 | echo '############ STEP 2 - DOWNLOAD CODE TO COMPILE ############'
111 | echo '###########################################################'
112 | # Create working directory
113 | echo "Step 2.0: Creating working directory"
114 | mkdir -p proxmox-kernel
115 | cd proxmox-kernel
116 |
117 | # Clone official Proxmox kernel repo & Relaxed RMRR Mapping patch if not already present.
118 | echo "Step 2.1: Downloading Proxmox kernel toolchain & patches"
119 | # The *_GIT_DIR_PRESENT flags are set to 1 in step 0 when an existing checkout was found; unset compares as 0 here.
120 | if [[ $PVE_KERNEL_GIT_DIR_PRESENT -ne 1 ]]; then
121 | git clone git://git.proxmox.com/git/pve-kernel.git
122 | fi
123 |
124 | if [[ $RELAX_INTEL_RMRR_GIT_DIR_PRESENT -ne 1 ]]; then
125 | git clone --depth=1 ${RELAX_INTEL_GIT_REPO}
126 | fi
127 |
128 | # Go to the actual Proxmox toolchain
129 | cd pve-kernel
130 |
131 | #Checkout the correct branch
132 | git checkout ${PVE_KERNEL_BRANCH}
133 |
134 | echo "Showing Git status:"
135 | git status
136 |
137 | # (OPTIONAL) Download flat copy of Ubuntu hirsute kernel submodule
138 | # If you skip this the "make" of Proxmox kernel toolchain will download a copy (a Proxmox kernel is based on Ubuntu
139 | # hirsute kernel). However, it will download it with the whole history etc which takes A LOT of space (and time). This
140 | # bypasses the process safely.
141 | # This curl skips certificate validation because Proxmox GIT WebUI doesn't send Let's Encrypt intermediate cert
142 | # NOTE: the download below is currently commented out (see TODO), so this step only prints its banner.
143 | echo "Step 2.2: Downloading base kernel"
144 | #TODO: This needs a proxmox7 fix
145 | # curl -k "https://git.proxmox.com/?p=mirror_ubuntu-hirsute-kernel.git;a=snapshot;h=$(git submodule status submodules/ubuntu-hirsute | cut -c 2-41);sf=tgz" --output kernel.tgz
146 | # tar -xf kernel.tgz -C submodules/ubuntu-hirsute/ --strip 1
147 | # rm kernel.tgz
148 |
149 |
150 |
151 | echo '###########################################################'
152 | echo '################# STEP 3 - CREATE KERNEL ##################'
153 | echo '###########################################################'
154 | echo "Step 3.0: Applying patches"
155 | #cp ../relax-intel-rmrr/patches/${RELAX_PATCH} ./patches/kernel/CUSTOM-add-relaxable-rmrr.patch
156 | #cp ../relax-intel-rmrr/patches/relaxable-rmrr-patch-sed.txt ./patches/kernel/
157 | sed -i '/^$(KERNEL_SRC).prepared: $(KERNEL_SRC_SUBMODULE) | submodule/r ../../../../patches/relaxable-rmrr-patch-sed.txt' Makefile
158 | patch -p1 < ../relax-intel-rmrr/patches/${PROXMOX_PATCH}
159 | # The sed call above inserts the contents of relaxable-rmrr-patch-sed.txt into the Makefile directly after the
160 | # "$(KERNEL_SRC).prepared:" rule line; its relative path is resolved from the current (pve-kernel) directory.
161 | echo "Step 3.1: Compiling kernel... (it will take 30m-3h)"
162 | # Note: DO NOT add -j to this make, see https://github.com/kiler129/relax-intel-rmrr/issues/1
163 | # This step will compile kernel & build all *.deb packages as Proxmox builds internally
164 | make clean
165 | make
166 |
167 |
168 | echo '###########################################################'
169 | echo '################ STEP 4 - INSTALL KERNEL ##################'
170 | echo '###########################################################'
171 | echo "Step 4: Installing packages"
172 | # -v only tests whether RMRR_AUTOINSTALL is set at all, so any value (even an empty one) enables the install.
173 | if [[ -v RMRR_AUTOINSTALL ]]; then
174 | apt install ./*.deb
175 | else
176 | echo '=====>>>> SKIPPED - to enable autoinstallation set "RMRR_AUTOINSTALL" environment variable.'
177 | echo '=====>>>> To install execute "dpkg -i *.deb" after this script finishes'
178 | fi
179 |
180 | echo '###########################################################'
181 | echo '################## STEP 5 - CLEANUP #######################'
182 | echo '###########################################################'
183 | # Collect the built packages into proxmox-kernel/debs. Removing the (~30GB) build trees is commented out below.
184 | echo "Step 5: Cleaning up..."
185 | cd ..
186 | mkdir -p $SCRIPT_DIR/proxmox-kernel/debs
187 | mv pve-kernel/*.deb $SCRIPT_DIR/proxmox-kernel/debs
188 | #rm -rf pve-kernel
189 | #rm -rf relax-intel-rmrr
190 |
191 | # Remove the lockfile.
192 | rm $SCRIPT_DIR/script_running
193 | exit 0
194 |
--------------------------------------------------------------------------------
/build/proxmox/build7.1-10.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Wrapper that builds the RMRR-relaxed Proxmox kernel from the
3 | # pve-kernel-5.13 branch: it exports the build parameters below and then
4 | # hands off to build.sh, which lives next to this script.
5 | set -e
6 |
7 | export PVE_KERNEL_BRANCH=pve-kernel-5.13
8 | export RELAX_INTEL_GIT_REPO="https://github.com/Aterfax/relax-intel-rmrr.git"
9 | export RELAX_PATCH="add-relaxable-rmrr-5_13.patch"
10 | export PROXMOX_PATCH="proxmox7.patch"
11 |
12 | # Resolve build.sh relative to this file instead of the caller's working
13 | # directory, so the wrapper can also be started from outside build/proxmox.
14 | # When run as ./build7.1-10.sh this still expands to ./build.sh.
15 | "$(dirname "${BASH_SOURCE[0]}")/build.sh"
16 |
--------------------------------------------------------------------------------
/build/proxmox/build7.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Wrapper that builds the RMRR-relaxed kernel for Proxmox 7 from the
3 | # pve-kernel-5.13 branch: it exports the build parameters below and then
4 | # hands off to build.sh, which lives next to this script.
5 | set -e
6 |
7 | export PVE_KERNEL_BRANCH=pve-kernel-5.13
8 | export RELAX_INTEL_GIT_REPO="https://github.com/Aterfax/relax-intel-rmrr.git"
9 | export RELAX_PATCH="add-relaxable-rmrr-5_13.patch"
10 | export PROXMOX_PATCH="proxmox7.patch"
11 |
12 | # Resolve build.sh relative to this file instead of the caller's working
13 | # directory, so the wrapper can also be started from outside build/proxmox.
14 | # When run as ./build7.sh this still expands to ./build.sh.
15 | "$(dirname "${BASH_SOURCE[0]}")/build.sh"
16 |
--------------------------------------------------------------------------------
/build/proxmox/build_latest.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Wrapper that builds the RMRR-relaxed Proxmox kernel from the master
3 | # branch of pve-kernel: it exports the build parameters below and then
4 | # hands off to build.sh, which lives next to this script.
5 | set -e
6 |
7 | export PVE_KERNEL_BRANCH=master
8 | export RELAX_INTEL_GIT_REPO="https://github.com/Aterfax/relax-intel-rmrr.git"
9 | export RELAX_PATCH="add-relaxable-rmrr-5_15.patch"
10 | export PROXMOX_PATCH="proxmox7.patch"
11 |
12 | # Resolve build.sh relative to this file instead of the caller's working
13 | # directory, so the wrapper can also be started from outside build/proxmox.
14 | # When run as ./build_latest.sh this still expands to ./build.sh.
15 | "$(dirname "${BASH_SOURCE[0]}")/build.sh"
16 |
--------------------------------------------------------------------------------
/build/proxmox/docker-compose.yaml:
--------------------------------------------------------------------------------
1 | # Compose definition for the kernel build container.
2 | version: "3"
3 | services:
4 |   build-cont:
5 |     # Build the image from this directory and tag it as build-img.
6 |     build: .
7 |     image: build-img
8 |     # Keep STDIN open and allocate a TTY for the container.
9 |     tty: true
10 |     stdin_open: true
11 |     volumes:
12 |       # Bind-mount ./debs on the host onto the container's debs directory.
13 |       # NOTE(review): build.sh moves the packages to
14 |       # $SCRIPT_DIR/proxmox-kernel/debs - confirm that resolves to the
15 |       # container path mounted here.
16 |       - "./debs:/build/proxmox-kernel/debs"
--------------------------------------------------------------------------------
/deep-dive.md:
--------------------------------------------------------------------------------
1 | ### Deep Dive into the problem
2 |
3 | ### Table of Contents
4 | 1. [Installation](README.md#installation)
5 | - [Proxmox - premade packages](README.md#proxmox---premade-packages)
6 | - [Proxmox - building from sources](README.md#proxmox---building-from-sources)
7 | - [Other distros](README.md#other-distros)
8 | 2. [Configuration](README.md#configuration)
9 | 3. **Deep Dive** <= you're here
10 | - [Technical details](deep-dive.md#technical-details)
11 | - [How virtual machines use memory?](deep-dive.md#how-virtual-machines-use-memory)
12 | - [Why do we need VT-d / AMD-Vi?](deep-dive.md#why-do-we-need-vt-d--amd-vi)
13 | - [How PCI/PCIe actually work?](deep-dive.md#how-pcipcie-actually-work)
14 | - [RMRR - the monster in a closet](deep-dive.md#rmrr---the-monster-in-a-closet)
15 | - [What vendors did wrong?](deep-dive.md#what-vendors-did-wrong)
16 | - [Other solutions & hacks](deep-dive.md#other-solutions--hacks)
17 | - [Contact your platform vendor](deep-dive.md#contact-your-platform-vendor)
18 | - [Use OS which ignores RMRRs](deep-dive.md#use-os-which-ignores-rmrrs)
19 | - [Attempt HPE's pseudofix (if you use HP)](deep-dive.md#attempt-hpes-pseudofix-if-you-use-hp)
20 | - [The comment-the-error-out hack (v3.17 - 5.3)](deep-dive.md#the-comment-the-error-out-hack-v317---53)
21 | - [Long-term solution - utilizing relaxable reservation regions (>=3.17)](deep-dive.md#long-term-solution---utilizing-relaxable-reservation-regions-317)
22 | - [Why commenting-out the error is a bad idea](deep-dive.md#why-commenting-out-the-error-is-a-bad-idea)
23 | - [The kernel moves on quickly](deep-dive.md#the-kernel-moves-on-quickly)
24 | - [What this patch actually does](deep-dive.md#what-this-patch-actually-does)
25 | - [Why kernel patch and not a loadable module?](deep-dive.md#why-kernel-patch-and-not-a-loadable-module)
26 | - [The future](deep-dive.md#the-future)
27 | 4. [Disclaimers](README.md#disclaimers)
28 | 5. [Acknowledgments & References](README.md#acknowledgments--references)
29 | 6. [License](README.md#license)
30 |
31 | ---
32 |
33 | ### Technical details
34 |
35 | #### How virtual machines use memory?
36 | To understand PCI passthrough we first need to understand how VMs work. Each VM launched in the system gets a new
37 | virtual address space and has no direct access to the host memory. Yet, the guest OS runs like it was running with a
38 | real RAM, using any memory addresses it wants. In other words the guest OS has no idea (in terms of memory) that it is
39 | being virtualized. Logically there has to be some map to translate guest OS requests to the real memory addresses, since
40 | multiple guest OSes have to share the same physical host memory. The hypervisor (host OS) is responsible for maintaining
41 | a map between GPA (Guest Physical Address) and HPA (Host Physical Address). To better understand this look at the (VERY
42 | simplified) graphics:
43 |
44 | ```
45 | +--------------------------------HOST----------------------------------------+
46 | | |
47 | | +--------------------------HOST MEMORY-------------------------------+ |
48 | | | +-------+ +----------GUEST MEMORY-----------+ | |
49 | | | | vim | |---------------------------------| | |
50 | | | | mem | |---------------------------------| | |
51 | | | +-------+ +---------------------------------+ | |
52 | | | 0xA000 0xA100 | |
53 | | +--------------------------------------------------------------------+ |
54 | | 0x0000 0xF000 0xF0FF 0x....|
55 | | |
56 | | +--------+ +----------------GUEST VM------------------+ |
57 | | | | | +------------GUEST MEMORY--------------+ | |
58 | | | vim | | | | | | | |
59 | | | | | | guest kernel| wget | | | |
60 | | +--------+ | | | mem | | | |
61 | | | +-------------+--------+---------------+ | |
62 | | | 0x00 0x1E 0x20 0xFF | |
63 | | | +------+ | |
64 | | | | wget | | |
65 | | | +------+ | |
66 | | +------------------------------------------+ |
67 | +----------------------------------------------------------------------------+
68 |
69 | (addresses don't represent real x86 space[!] and are not drawn to scale)
70 | ```
71 |
72 | When a VM is run the hypervisor gives it a predetermined amount of memory and tells the guest OS that it has a contiguous
73 | space of 255 bytes. The guest OS knows it can use 255 bytes from 0x00 and doesn't care/know where this memory physically
74 | resides. Host OS now needs to find space for 255 bytes, either in one or multiple chunks in the physical memory. It can
75 | map it as on the diagram to one big chunk or split it into multiple ones, as long as it can map guest request for its
76 | `0x1E`-`0x20` to e.g. `0xF010`-`0xF012` and return the data.
77 |
78 | ---
79 |
80 | #### Why do we need VT-d / AMD-Vi?
81 | While mapping the memory (as described in the previous section) the host OS must take care of three things:
82 | 1. When guest OS requests a page from memory using its (GPA) address it will get it from the HPA-addressed memory (=mapping)
83 | 2. Memory of the guest cannot be touched by anything other than the guest (=protection)
84 | 3. The process needs to be fast
85 |
86 | While the first two are achievable with pure software emulation, it makes the memory access process slow as molasses
87 | since it can no longer rely on [DMA](https://en.wikipedia.org/wiki/Direct_memory_access) but must involve the CPU for every
88 | transfer, shifting bytes back and forth.
89 | Both VT-d and AMD-Vi allow to essentially instruct the hardware to do the mapping and enforce domains (security
90 | boundaries). In such case host OS simply needs to inform the hardware about the address to be translated on-the-fly.
91 |
92 | More on that can be found in the [Intel VT-d docs](https://software.intel.com/content/www/us/en/develop/articles/intel-virtualization-technology-for-directed-io-vt-d-enhancing-intel-platforms-for-efficient-virtualization-of-io-devices.html).
93 |
94 | ---
95 |
96 | #### How PCI/PCIe actually work?
97 | Most people blindly plop `intel_iommu=on` and `iommu=pt` into their kernel line and get surprised when things don't
98 | work. I did too, so I started digging, which resulted in this whole repository.
99 |
100 | Every device in the system has some reserved memory address space. It's used by the device and the host
101 | system to communicate and exchange data. That reserved memory address is dictated by the firmware (i.e. BIOS) as both
102 | the device and OS must know it to communicate. In essence this is just slightly different than normal memory mapping.
103 | Here, you don't have just some OS using the memory but an OS **and** a device using the memory.
104 |
105 | Here's where [IOMMU](https://en.wikipedia.org/wiki/Input–output_memory_management_unit) comes into play. In essence it's
106 | able to remap GPA to HPA for both the OS and the device so that they can talk to each other. When device memory is
107 | remapped the guest OS talks to the hardware like it was really under some physical address it expects, while in reality
108 | the [IOMMU](https://en.wikipedia.org/wiki/Input–output_memory_management_unit) moves the reserved region aperture
109 | somewhere else in the address space. This is *usually* fine.
110 |
111 | ---
112 |
113 | #### RMRR - the monster in a closet
114 | While both AMD and Intel allow for [IOMMU](https://en.wikipedia.org/wiki/Input–output_memory_management_unit) remapping
115 | device's memory, Intel had an idea to introduce RMRR (Reserved Memory Region Reporting). In essence the firmware/BIOS
116 | publishes a list of regions where usage of [IOMMU](https://en.wikipedia.org/wiki/Input–output_memory_management_unit) is
117 | ostensibly prohibited. The original intent for that feature was good, by allowing for USB keyboards to be automagically
118 | emulated by the USB controller itself before USB driver is loaded, like they were connected via PS/2. This also allows
119 | the GPU to display the picture before OS is loaded and even before [IOMMU](https://en.wikipedia.org/wiki/Input–output_memory_management_unit)
120 | is initialized.
121 | However, it required some sacrifices: that memory should not be remapped as only OS and the device use the [IOMMU](https://en.wikipedia.org/wiki/Input–output_memory_management_unit)
122 | and devices on the motherboard which may be communicating with e.g. the GPU pre-boot don't know anything about the
123 | mapping.
124 |
125 | However, one *undocumented assumption* was made: as soon as the driver is loaded the "out-of-band" access to the device
126 | ends and the OS takes over. However, *technically* the VT-d specification says that the RMRR is valid indefinitely.
127 |
128 | Linux for a long time (up until [v3.17rc1](https://github.com/torvalds/linux/commit/c875d2c1b8083cd627ea0463e20bf22c2d7421ee))
129 | didn't respect RMRR while setting up [IOMMU](https://en.wikipedia.org/wiki/Input–output_memory_management_unit),
130 | respecting that against-the-specs but ubiquitous assumption. This was an oversight as [IOMMU](https://en.wikipedia.org/wiki/Input–output_memory_management_unit)
131 | API assumes exclusive control over the remapped address space. If such space is remapped the DMA access from outside of
132 | the [IOMMU](https://en.wikipedia.org/wiki/Input–output_memory_management_unit) domain (i.e. from something else than the
133 | host or VM guest OS, like a device on the motherboard) will fail which may lead to unpredictable results if the hardware
134 | vendor didn't follow the *undocumented assumption*.
135 |
136 |
137 | Linux, as of now, excludes two specific classes of devices from being constrained by RMRR:
138 | - USB devices (as we historically trust they don't do weird things)
139 | - GPUs (unspoken rule that they're accessed out-of-band only before the driver loads)
140 |
141 |
142 | RMRR *by itself* isn't evil, as long as it's used as [Intel's VT-d specification](https://software.intel.com/content/www/us/en/develop/download/intel-virtualization-technology-for-directed-io-architecture-specification.html)
143 | intended - "*[RMRRs] that are either not DMA targets, or memory ranges that may be target of BIOS
144 | initiated DMA only during pre-boot phase (such as from a boot disk drive) **must not** be included in the reserved
145 | memory region reporting.*".
146 |
147 |
148 | Intel anticipated that some would be tempted to misuse the feature, as they warned in the VT-d specification: "*RMRR
149 | regions are expected to be used for legacy usages (...). Platform designers should avoid or limit use of reserved memory
150 | regions*".
151 |
152 | ----
153 |
154 | #### What vendors did wrong?
155 | HP (and probably others) decided to mark **every freaking PCI device memory space as RMRR!**`*` Like that,
156 | just in case... just that their tools could potentially maybe monitor these devices while OS agent is not installed. But
157 | wait, there's more! They marked **ALL** devices as such, even third party ones physically installed in motherboard's
158 | PCI/PCIe slots!
159 |
160 | This in turn killed PCI passthrough for any of the devices in systems running Linux [>=3.17rc1](https://github.com/torvalds/linux/commit/c875d2c1b8083cd627ea0463e20bf22c2d7421ee).
161 |
162 | *`*` In case you skipped other sections above, RMRR is a special part of the memory which cannot be moved
163 | to a VM.*
164 |
165 | ---
166 |
167 | ### Other solutions & hacks
168 |
169 | #### Contact your platform vendor
170 | As the error suggests you can try to convince your vendor to fix the BIOS. If you do please create an issue in this repo
171 | to tell me about it, as this is **the only** real solution to the problem.
172 |
173 | ---
174 |
175 | #### Use OS which ignores RMRRs
176 | Some operating systems, notably [VMWare ESXi and vSphere](https://www.vmware.com/products/esxi-and-esx.html), are
177 | believed to ignore RMRRs (cannot be verified as they're closed-source). They're able to passthrough the devices without
178 | a problem, as long as you don't do something deliberately dangerous (see [Disclaimers](README.md#disclaimers)).
179 |
180 | ---
181 |
182 | #### Attempt HPE's pseudofix (if you use HP)
183 | To HPE's credit, they [recognized the problem and released an advisory with mitigations](https://support.hpe.com/hpesc/public/docDisplay?docId=emr_na-c04781229).
184 | In short the HPE's solution is threefold:
185 | 1. Fix the firmware to not include GPUs in RMRR
186 | 2. Use the System Configuration utility on Gen9+ servers to disable "HP Shared Memory features" on selected HP cards
187 | 3. Use their CLI BIOS/RBSU reconfiguration utility to set special (invisible in menus) flags opting-out PCIe slots
188 | from "smart monitoring"
189 |
190 | However, we wouldn't be here if it actually worked as expected:
191 | - Fix 1 works only on GPUs and affects Linux 3.17-5.4 (as kernel has GPU exclusion since 5.4)
192 | - Fix 2 only works on *some* **external** HPE ethernet adapters with Gen9 and newer servers
193 | - Fix 3 theoretically works on all NICs, but not other cards (e.g. HBAs) and [doesn't actually work](https://community.hpe.com/t5/proliant-servers-netservers/microserver-gen8-quot-device-is-ineligible-for-iommu-domain/td-p/6947461#.X5D7SS9h1TY)
194 | (sic!) on some servers which are listed as affected (e.g. widely popular [HP/HPE Microserver Gen8](https://support.hpe.com/hpesc/public/docDisplay?docId=emr_na-c03793258))
195 |
196 | Some tried [opening a support case](https://community.hpe.com/t5/proliant-servers-netservers/re-device-is-ineligible-for-iommu-domain-attach-due-to-platform/m-p/6817728/highlight/true#M21006)
197 | but the topic dried out. I tried [nagging HPE to fix the BIOS](https://community.hpe.com/t5/proliant-servers-ml-dl-sl/disabling-rmrds-rmrr-hp-shared-memory-features-on-microserver/td-p/7105623#.X5C0oy9h2uV).
198 | Maybe there's a chance? Who knows... the future will show.
199 |
200 | ---
201 |
202 | #### The comment-the-error-out hack (v3.17 - 5.3)
203 | I was able to track the first mentions of this method to [a post by dschense on a German Proxmox forum](https://forum.proxmox.com/threads/hp-proliant-microserver-gen8-raidcontroller-hp-p410-passthrough-probleme.30547/post-155675)
204 | ([en version](https://translate.googleusercontent.com/translate_c?depth=2&pto=aue&rurl=translate.google.com&sl=de&tl=en&u=https://forum.proxmox.com/threads/hp-proliant-microserver-gen8-raidcontroller-hp-p410-passthrough-probleme.30547/post-155675)).
205 |
206 | In essence this was a logical conclusion: if you have an error comment it out and see what happens. It worked on the
207 | original protection being introduced in Linux v3.17. Unfortunately, the Linux v5.3 changed a lot (see [next section](deep-dive.md#long-term-solution---utilizing-relaxable-reservation-regions-317)).
208 |
209 | ---
210 |
211 | #### Long-term solution - utilizing relaxable reservation regions (>=3.17)
212 |
213 | ##### Why commenting-out the error is a bad idea
214 | Before Linux v5.3 RMRRs protection relied on [a simple patch introduced in v3.17](https://github.com/torvalds/linux/commit/c875d2c1b8083cd627ea0463e20bf22c2d7421ee)
215 | which excluded USB devices. [Commenting out the error](#the-comment-the-error-out-hack-v317---53) was a working
216 | solution, as the kernel (including KVM subsystem) didn't care about the reserved regions.
217 |
218 | The situation changed dramatically. A large change aimed to [introduce IOVA list management](https://patchwork.kernel.org/project/kvm/cover/20190723160637.8384-1-shameerali.kolothum.thodi@huawei.com/)
219 | outside of the [IOMMU](https://en.wikipedia.org/wiki/Input–output_memory_management_unit) driver was introduced. About
220 | the same time the RMRRs reserved memory [was split into two logical buckets](https://github.com/torvalds/linux/commit/1c5c59fbad20a63954de07687e4a29af18d1be12):
221 | absolutely-reserved (`IOMMU_RESV_DIRECT`) and so-called relaxed (`IOMMU_RESV_DIRECT_RELAXABLE`). USB devices and now
222 | GPUs were marked as *"relaxable"* as they were deemed safe to be remapped (even if against the VT-d specs and
223 | firmware's will).
224 |
225 |
226 | ##### The kernel moves on quickly
227 | Other subsystems naturally [started utilizing](https://github.com/torvalds/linux/commit/9b77e5c79840fc334a5b7f770c5ab0c09dc0e028)
228 | that new IOVA interface, which broke the *"[comment-the-error-out](#the-comment-the-error-out-hack-v317---53)"* patch.
229 | Now with the [IOMMU](https://en.wikipedia.org/wiki/Input–output_memory_management_unit) error message commented out QEMU
230 | [will explode on vfio_dma_map()](https://bugs.launchpad.net/qemu/+bug/1869006/comments/14).
231 | Understandably, and for good reasons, [developers refuse to accommodate any requests to disable that](https://bugs.launchpad.net/qemu/+bug/1869006/comments/18).
232 | While even more checks can be commented-out and patched, as more subsystems in the kernel start relying on the IOVA
233 | lists management, it will be a cat-and-mouse game after every kernel release.
234 |
235 |
236 | ##### What this patch actually does
237 | The patch plugs into the same mechanism as the vanilla kernel used to [mark USB and GPUs as "relaxable"](https://github.com/torvalds/linux/commit/1c5c59fbad20a63954de07687e4a29af18d1be12).
238 | This has three benefits:
239 | - The RMRR is not fully NULLified, as the memory is marked as reserved-with-exceptions and not just not reserved. This,
240 | combined with IOVA list management ensures that if some code somewhere needs to work differently with relaxable
241 | devices it will work with this patch properly.
242 | - This patch doesn't introduce inconsistent state in the kernel. RMRRs are not hidden from the kernel by removal, nor
243 | ignored just in one place. This patch just changes the designation of these regions from `IOMMU_RESV_DIRECT` (*"we
244 | know it's reserved and we will hold your hand"*) to [`IOMMU_RESV_DIRECT_RELAXABLE`](https://lore.kernel.org/patchwork/patch/1079954/)
245 | (*"we know it's reserved but it's your playground"*).
246 | - It works across all affected kernels (v5.9.1 being the newest at the time of writing)
247 |
248 | Additionally, this mechanism is [controllable with a boot option](README.md#configuration) making it safe and easy to
249 | disable as needed.
250 |
251 |
252 | ##### Why kernel patch and not a loadable module?
253 | Before taking this approach I poked around to see if the [IOMMU driver](https://github.com/torvalds/linux/tree/master/drivers/iommu/intel)
254 | has any API around RMRR. It does not. The driver doesn't export any functions which can make the module feasible.
255 | While Linux >=5.3 has the IOVA list management interface, it is [being built by the Intel IOMMU driver](https://github.com/torvalds/linux/commit/1c5c59fbad20a63954de07687e4a29af18d1be12).
256 | What it means is the hardcoded relaxable logic [decides about IOVA designation](https://github.com/torvalds/linux/commit/1c5c59fbad20a63954de07687e4a29af18d1be12#diff-e1fff7a2368c04e11696812359f854de9da431c63ec7c5a7bec8f6020e112a2aR2916).
257 | Later on, the same logic is [used for a final sanity check](https://github.com/torvalds/linux/blob/5f9e832c137075045d15cd6899ab0505cfb2ca4b/drivers/iommu/intel-iommu.c#L5057)
258 | independently from the state of the memory saved in the IOVA list. Only after this check passes the IOMMU mapping is
259 | added.
260 |
261 | In other words even if >=5.4 [IOVA API is used to modify](https://github.com/torvalds/linux/commit/af029169b8fdae31064624d60b5469a3da95ad32)
262 | the assignment, the actual IOMMU remapping will fail with the *"Device is ineligible for IOMMU domain attach..."* error.
263 |
264 |
265 | #### The future
266 | It will be great if this patch could be upstreamed. However, I see slim-to-none chance of that happening, as this change
267 | is prone to abuse. However, I will definitely try to communicate with kernel folks on how to proceed.
268 |
--------------------------------------------------------------------------------
/patches/add-relaxable-rmrr-5_11.patch:
--------------------------------------------------------------------------------
1 | --- a/drivers/iommu/intel/iommu.c 2021-07-30 16:21:22.235520365 +0100
2 | +++ b/drivers/iommu/intel/iommu.c 2021-07-30 16:28:28.905719413 +0100
3 | @@ -355,6 +355,7 @@
4 | static int intel_iommu_superpage = 1;
5 | static int iommu_identity_mapping;
6 | static int iommu_skip_te_disable;
7 | +static int intel_relaxable_rmrr = 0;
8 |
9 | #define IDENTMAP_GFX 2
10 | #define IDENTMAP_AZALIA 4
11 | @@ -455,7 +456,10 @@
12 | } else if (!strncmp(str, "tboot_noforce", 13)) {
13 | pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
14 | intel_iommu_tboot_noforce = 1;
15 | - }
16 | + } else if (!strncmp(str, "relax_rmrr", 10)) {
17 | + pr_info("Intel-IOMMU: assuming all RMRRs are relaxable. This can lead to instability or data loss\n");
18 | + intel_relaxable_rmrr = 1;
19 | + }
20 |
21 | str += strcspn(str, ",");
22 | while (*str == ',')
23 | @@ -2802,7 +2806,7 @@
24 | return false;
25 |
26 | pdev = to_pci_dev(dev);
27 | - if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
28 | + if (intel_relaxable_rmrr || IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
29 | return true;
30 | else
31 | return false;
32 |
--------------------------------------------------------------------------------
/patches/add-relaxable-rmrr-5_13.patch:
--------------------------------------------------------------------------------
1 | --- a/drivers/iommu/intel/iommu.c 2022-02-26 13:51:33.821885509 +0100
2 | +++ b/drivers/iommu/intel/iommu.c 2022-02-26 13:58:27.231463792 +0100
3 | @@ -364,6 +364,7 @@
4 | static int intel_iommu_superpage = 1;
5 | static int iommu_identity_mapping;
6 | static int iommu_skip_te_disable;
7 | +static int intel_relaxable_rmrr = 0;
8 |
9 | #define IDENTMAP_GFX 2
10 | #define IDENTMAP_AZALIA 4
11 | @@ -465,6 +466,9 @@
12 | } else if (!strncmp(str, "tboot_noforce", 13)) {
13 | pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
14 | intel_iommu_tboot_noforce = 1;
15 | + } else if (!strncmp(str, "relax_rmrr", 10)) {
16 | + pr_info("Intel-IOMMU: assuming all RMRRs are relaxable. This can lead to instability or data loss\n");
17 | + intel_relaxable_rmrr = 1;
18 | } else {
19 | pr_notice("Unknown option - '%s'\n", str);
20 | }
21 | @@ -2846,7 +2850,7 @@
22 | return false;
23 |
24 | pdev = to_pci_dev(dev);
25 | - if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
26 | + if (intel_relaxable_rmrr || IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
27 | return true;
28 | else
29 | return false;
30 |
--------------------------------------------------------------------------------
/patches/add-relaxable-rmrr-5_15.patch:
--------------------------------------------------------------------------------
1 | --- a/drivers/iommu/intel/iommu.c 2022-05-04 18:50:13.078092713 +0100
2 | +++ b/drivers/iommu/intel/iommu.c 2022-05-04 18:45:09.909672434 +0100
3 | @@ -345,6 +345,7 @@
4 | static int intel_iommu_superpage = 1;
5 | static int iommu_identity_mapping;
6 | static int iommu_skip_te_disable;
7 | +static int intel_relaxable_rmrr = 0;
8 |
9 | #define IDENTMAP_GFX 2
10 | #define IDENTMAP_AZALIA 4
11 | @@ -450,6 +451,9 @@
12 | } else if (!strncmp(str, "tboot_noforce", 13)) {
13 | pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
14 | intel_iommu_tboot_noforce = 1;
15 | + } else if (!strncmp(str, "relax_rmrr", 10)) {
16 | + pr_info("Intel-IOMMU: assuming all RMRRs are relaxable. This can lead to instability or data loss\n");
17 | + intel_relaxable_rmrr = 1;
18 | } else {
19 | pr_notice("Unknown option - '%s'\n", str);
20 | }
21 | @@ -2832,7 +2836,7 @@
22 | return false;
23 |
24 | pdev = to_pci_dev(dev);
25 | - if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
26 | + if (intel_relaxable_rmrr || IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
27 | return true;
28 | else
29 | return false;
30 |
--------------------------------------------------------------------------------
/patches/add-relaxable-rmrr-5_8_and_up.patch:
--------------------------------------------------------------------------------
1 | --- a/drivers/iommu/intel/iommu.c
2 | +++ b/drivers/iommu/intel/iommu.c
3 | @@ -356,6 +356,7 @@
4 | static int intel_iommu_superpage = 1;
5 | static int iommu_identity_mapping;
6 | static int intel_no_bounce;
7 | +static int intel_relaxable_rmrr = 0;
8 | static int iommu_skip_te_disable;
9 |
10 | #define IDENTMAP_GFX 2
11 | @@ -463,6 +464,9 @@
12 | } else if (!strncmp(str, "nobounce", 8)) {
13 | pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
14 | intel_no_bounce = 1;
15 | + } else if (!strncmp(str, "relax_rmrr", 10)) {
16 | + pr_info("Intel-IOMMU: assuming all RMRRs are relaxable. This can lead to instability or data loss\n");
17 | + intel_relaxable_rmrr = 1;
18 | }
19 |
20 | str += strcspn(str, ",");
21 | @@ -2863,7 +2867,7 @@
22 | return false;
23 |
24 | pdev = to_pci_dev(dev);
25 | - if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
26 | + if (intel_relaxable_rmrr || IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
27 | return true;
28 | else
29 | return false;
30 |
31 |
--------------------------------------------------------------------------------
/patches/add-relaxable-rmrr-below-5_8.patch:
--------------------------------------------------------------------------------
1 | --- a/drivers/iommu/intel-iommu.c
2 | +++ b/drivers/iommu/intel-iommu.c
3 | @@ -367,6 +367,7 @@ static int intel_iommu_strict;
4 | static int intel_iommu_superpage = 1;
5 | static int iommu_identity_mapping;
6 | static int intel_no_bounce;
7 | +static int intel_relaxable_rmrr = 0;
8 |
9 | #define IDENTMAP_ALL 1
10 | #define IDENTMAP_GFX 2
11 | @@ -468,6 +469,9 @@ static int __init intel_iommu_setup(char *str)
12 | } else if (!strncmp(str, "nobounce", 8)) {
13 | pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
14 | intel_no_bounce = 1;
15 | + } else if (!strncmp(str, "relax_rmrr", 10)) {
16 | + pr_info("Intel-IOMMU: assuming all RMRRs are relaxable. This can lead to instability or data loss\n");
17 | + intel_relaxable_rmrr = 1;
18 | }
19 |
20 | str += strcspn(str, ",");
21 | @@ -2866,7 +2870,7 @@ static bool device_rmrr_is_relaxable(struct device *dev)
22 | return false;
23 |
24 | pdev = to_pci_dev(dev);
25 | - if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
26 | + if (intel_relaxable_rmrr || IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
27 | return true;
28 | else
29 | return false;
30 |
--------------------------------------------------------------------------------
/patches/proxmox.patch:
--------------------------------------------------------------------------------
1 | --- a/Makefile
2 | +++ b/Makefile
3 | @@ -11,7 +11,7 @@
4 | KERNEL_MAJMIN=$(KERNEL_MAJ).$(KERNEL_MIN)
5 | KERNEL_VER=$(KERNEL_MAJMIN).$(KERNEL_PATCHLEVEL)
6 |
7 | -EXTRAVERSION=-${KREL}-pve
8 | +EXTRAVERSION=-${KREL}-pve-relaxablermrr
9 | KVNAME=${KERNEL_VER}${EXTRAVERSION}
10 | PACKAGE=pve-kernel-${KVNAME}
11 | HDRPACKAGE=pve-headers-${KVNAME}
12 |
--------------------------------------------------------------------------------
/patches/proxmox7.patch:
--------------------------------------------------------------------------------
1 | --- a/Makefile
2 | +++ b/Makefile
3 | @@ -13,7 +13,7 @@ PKGREL=1
4 | KERNEL_MAJMIN=$(KERNEL_MAJ).$(KERNEL_MIN)
5 | KERNEL_VER=$(KERNEL_MAJMIN).$(KERNEL_PATCHLEVEL)
6 |
7 | -EXTRAVERSION=-$(KREL)-pve
8 | +EXTRAVERSION=-$(KREL)-pve-relaxablermrr
9 | KVNAME=$(KERNEL_VER)$(EXTRAVERSION)
10 | PACKAGE=pve-kernel-$(KVNAME)
11 | HDRPACKAGE=pve-headers-$(KVNAME)
12 |
--------------------------------------------------------------------------------
/patches/relaxable-rmrr-patch-sed.txt:
--------------------------------------------------------------------------------
1 | sed -i '/^static int iommu_skip_te_disable;.*/a static int intel_relaxable_rmrr = 0;' ${KERNEL_SRC_SUBMODULE}/drivers/iommu/intel/iommu.c
2 | sed -i 's/if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))/if (intel_relaxable_rmrr || IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))/g' ${KERNEL_SRC_SUBMODULE}/drivers/iommu/intel/iommu.c
3 | sed -i '/intel_iommu_tboot_noforce = 1;/a \\ \ } else if (!strncmp(str, "relax_rmrr", 10)) {\n\ \ \ pr_info("Intel-IOMMU: assuming all RMRRs are relaxable. This can lead to instability or data loss\\n");\n\ \ \ intel_relaxable_rmrr = 1;' ${KERNEL_SRC_SUBMODULE}/drivers/iommu/intel/iommu.c
4 |
--------------------------------------------------------------------------------