├── .github
    └── workflows
    │   ├── build-kernel-debs-7.1.yml
    │   ├── build-kernel-debs.yml
    │   ├── docker-publish.yml
    │   └── docker-test.yml
├── .gitignore
├── Dockerfile
├── README.md
├── build
    └── proxmox
    │   ├── README.md
    │   ├── build.sh
    │   ├── build7.1-10.sh
    │   ├── build7.sh
    │   ├── build_latest.sh
    │   └── docker-compose.yaml
├── deep-dive.md
└── patches
    ├── add-relaxable-rmrr-5_11.patch
    ├── add-relaxable-rmrr-5_13.patch
    ├── add-relaxable-rmrr-5_15.patch
    ├── add-relaxable-rmrr-5_8_and_up.patch
    ├── add-relaxable-rmrr-below-5_8.patch
    ├── proxmox.patch
    ├── proxmox7.patch
    └── relaxable-rmrr-patch-sed.txt


/.github/workflows/build-kernel-debs-7.1.yml:
--------------------------------------------------------------------------------
 1 | name: Build kernel debs v7.1
 2 |
 3 | on:
 4 |   workflow_dispatch:
 5 |   schedule:
 6 |     # * is a special character in YAML so you have to quote this string
 7 |     - cron: '0 4 * * 6'
 8 | jobs:
 9 |   build-kernel-debs:
10 |     runs-on: [pve-kernel]
11 |     container:
12 |       image: aterfax/relaxable-rmrr-proxmox-kernel-builder:latest
13 |       options: -v ${{ github.workspace }}:/build/proxmox/proxmox-kernel #Note this is technically a very bad idea if your Runner is doing more than this sole action due to environment pollution.
14 |
15 |     steps:
16 |
17 |       - name: Pre-clean up debs if present
18 |         run: bash -c 'if [[ -d "/build/proxmox/proxmox-kernel/debs" ]]; then rm -rf /build/proxmox/proxmox-kernel/debs; fi'
19 |
20 |       - name: Build kernel
21 |         run: cd /build/proxmox/ && ./build7.1-10.sh
22 |
23 |       - name: Zip up debs
24 |         # zip updates an existing archive in place, so remove any leftover one first to keep old debs out of the release
25 |         run: rm -f release.zip && zip -r release.zip /build/proxmox/proxmox-kernel/debs
26 |
27 |       - name: Archive the generated debs
28 |         # v3 of the artifact actions has been retired by GitHub
29 |         uses: actions/upload-artifact@v4
30 |         with:
31 |           name: RMRR-Relaxation-Patched-PVE-kernel-debs-zip
32 |           path: release.zip
33 |
34 |       - name: Calculate release zip checksum
35 |         run: bash -c 'sha256sum release.zip && md5sum release.zip'
36 |
37 |       - name: Clean up release zip
38 |         if: always() # run even when an earlier step failed so nothing is left behind in the reused workspace
39 |         run: rm -f release.zip
40 |
41 |       - name: Clean up debs if present
42 |         if: always() # run even when an earlier step failed so nothing is left behind in the reused workspace
43 |         run: bash -c 'if [[ -d "debs" ]]; then rm -rf debs; fi'
44 |
--------------------------------------------------------------------------------
/.github/workflows/build-kernel-debs.yml:
--------------------------------------------------------------------------------
 1 | name: Build kernel debs
 2 |
 3 | on:
 4 |   workflow_dispatch:
 5 |   schedule:
 6 |     # * is a special character in YAML so you have to quote this string
 7 |     - cron: '0 2 * * 6'
 8 | jobs:
 9 |   build-kernel-debs:
10 |     runs-on: [pve-kernel]
11 |     container:
12 |       image: aterfax/relaxable-rmrr-proxmox-kernel-builder:latest
13 |       options: -v ${{ github.workspace }}:/build/proxmox/proxmox-kernel #Note this is technically a very bad idea if your Runner is doing more than this sole action due to environment pollution.
14 |
15 |     steps:
16 |
17 |       - name: Pre-clean up debs if present
18 |         run: bash -c 'if [[ -d "/build/proxmox/proxmox-kernel/debs" ]]; then rm -rf /build/proxmox/proxmox-kernel/debs; fi'
19 |
20 |       - name: Build kernel
21 |         run: cd /build/proxmox/ && ./build_latest.sh
22 |
23 |       - name: Zip up debs
24 |         # zip updates an existing archive in place, so remove any leftover one first to keep old debs out of the release
25 |         run: rm -f release.zip && zip -r release.zip /build/proxmox/proxmox-kernel/debs
26 |
27 |       - name: Archive the generated debs
28 |         # v3 of the artifact actions has been retired by GitHub
29 |         uses: actions/upload-artifact@v4
30 |         with:
31 |           name: RMRR-Relaxation-Patched-PVE-kernel-debs-zip
32 |           path: release.zip
33 |
34 |       - name: Calculate release zip checksum
35 |         run: bash -c 'sha256sum release.zip && md5sum release.zip'
36 |
37 |       - name: Clean up release zip
38 |         if: always() # run even when an earlier step failed so nothing is left behind in the reused workspace
39 |         run: rm -f release.zip
40 |
41 |       - name: Clean up debs if present
42 |         if: always() # run even when an earlier step failed so nothing is left behind in the reused workspace
43 |         run: bash -c 'if [[ -d "debs" ]]; then rm -rf debs; fi'
44 |
--------------------------------------------------------------------------------
/.github/workflows/docker-publish.yml:
--------------------------------------------------------------------------------
 1 | name: Build and Publish Docker Image
 2 |
 3 | on:
 4 |   push:
 5 |     branches: [master]
 6 |     paths:
 7 |       - 'Dockerfile'
 8 |       # the image COPYs these directories, so changes to them must republish it as well
 9 |       - 'build/**'
10 |       - 'patches/**'
11 |   workflow_dispatch:
12 |
13 | jobs:
14 |   build-and-push-docker-image:
15 |     name: Build Docker image and push to repositories
16 |     runs-on: ubuntu-latest
17 |     steps:
18 |
19 |       - name: Checkout code
20 |         uses: actions/checkout@v2
21 |
22 |       - name: Set up Docker Buildx
23 |         id: buildx
24 |         uses: docker/setup-buildx-action@v1
25 |
26 |       - name: Login to DockerHub
27 |         uses: docker/login-action@v1
28 |         with:
29 |           username: ${{ secrets.DOCKERHUB_USERNAME }}
30 |           password: ${{ secrets.DOCKERHUB_TOKEN }}
31 |
32 |       - name: Login to Github Packages
33 |         uses: docker/login-action@v1
34 |         with:
35 |           registry: ghcr.io
36 |           username: ${{ github.actor }}
37 |           password: ${{ secrets.GHCR_PAT }}
38 |
39 |       - name: Build image and push to Docker Hub and GitHub Container Registry
40 |         id: docker_build # referenced by the "Image digest" step below
41 |         uses: docker/build-push-action@v2
42 |         with:
43 |           context: .
44 |           push: true
45 |           tags: |
46 |             aterfax/relaxable-rmrr-proxmox-kernel-builder:latest
47 |             ghcr.io/aterfax/relaxable-rmrr-proxmox-kernel-builder:latest
48 |
49 |       - name: Image digest
50 |         run: echo ${{ steps.docker_build.outputs.digest }}
51 |
--------------------------------------------------------------------------------
/.github/workflows/docker-test.yml:
--------------------------------------------------------------------------------
 1 | name: Build Docker Image
 2 |
 3 | on:
 4 |   # run it during pull request
 5 |   pull_request:
 6 |     paths:
 7 |       - 'Dockerfile'
 8 |       # the image COPYs these directories, so changes to them must be test-built as well
 9 |       - 'build/**'
10 |       - 'patches/**'
11 |
12 |   workflow_dispatch:
13 |
14 | jobs:
15 |   # define job to build (without publishing) the docker image
16 |   build-docker-image:
17 |     name: Build Docker image only
18 |     # run only when code is compiling and tests are passing
19 |     runs-on: ubuntu-latest
20 |
21 |     # steps to perform in job
22 |     steps:
23 |       - name: Checkout code
24 |         uses: actions/checkout@v2
25 |
26 |       # setup Docker build action
27 |       - name: Set up Docker Buildx
28 |         id: buildx
29 |         uses: docker/setup-buildx-action@v1
30 |
31 |       - name: Build image only
32 |         id: docker_build # referenced by the "Image digest" step below
33 |         uses: docker/build-push-action@v2
34 |         with:
35 |           context: .
36 |           push: false
37 |
38 |       - name: Image digest
39 |         run: echo ${{ steps.docker_build.outputs.digest }}
40 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | proxmox-kernel/
2 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | # vim:set ft=dockerfile:
 2 |
 3 | # This Dockerfile builds the newest kernel with RMRR patch
 4 | #
 5 | # TODO Add support for custom branch of build
 6 | FROM debian:bullseye
 7 |
 8 | RUN mkdir -p /build
 9 | WORKDIR /build
10 |
11 | RUN set -x \
12 |     && apt update && apt install -y ca-certificates wget
13 |
14 | # apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 7BF2812E8A6E88E0
15 | RUN apt -y install gnupg && wget https://enterprise.proxmox.com/debian/proxmox-release-bullseye.gpg -O /etc/apt/trusted.gpg.d/proxmox-release-bullseye.gpg && \
16 |     echo 'deb http://download.proxmox.com/debian/pve bullseye pve-no-subscription' > /etc/apt/sources.list.d/pve.list
17 |
18 | # Refresh the package index in the SAME layer as the install so a cached, outdated index is never reused
19 | RUN apt -y update && \
20 |     apt -y install git nano screen patch fakeroot build-essential devscripts libncurses5 libncurses5-dev libssl-dev bc \
21 |     flex bison libelf-dev libaudit-dev libgtk2.0-dev libperl-dev asciidoc xmlto gnupg gnupg2 rsync lintian debhelper \
22 |     libdw-dev libnuma-dev libslang2-dev sphinx-common asciidoc-base automake cpio dh-python file gcc kmod libiberty-dev \
23 |     libpve-common-perl libtool perl-modules python3-minimal python3-dev sed tar zlib1g-dev lz4 curl zstd dwarves
24 |
25 | #Need pahole 1.16 or above
26 | RUN TEMP_DEB="$(mktemp)" && \
27 |     wget -O "$TEMP_DEB" http://archive.ubuntu.com/ubuntu/pool/universe/d/dwarves-dfsg/dwarves_1.21-0ubuntu1~20.04.1_amd64.deb && \
28 |     dpkg -i "$TEMP_DEB" && \
29 |     rm -f "$TEMP_DEB"
30 |
31 | # Copy both folders into docker root filepath.
32 | COPY build /build
33 | COPY patches /patches
34 |
35 | #ENTRYPOINT ["tail", "-f", "/dev/null"]
36 | ENTRYPOINT ["bash", "-c", "cd /build/proxmox/ && ./build_latest.sh"]
37 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # 🍻 Relaxed RMRR Mapping for Linux 3.17+ - ARCHIVED
  2 | [![Build and Publish Docker Image](https://github.com/Aterfax/relax-intel-rmrr/actions/workflows/docker-publish.yml/badge.svg)](https://github.com/Aterfax/relax-intel-rmrr/actions/workflows/docker-publish.yml)
  3 | [![Build kernel debs](https://github.com/Aterfax/relax-intel-rmrr/actions/workflows/build-kernel-debs.yml/badge.svg)](https://github.com/Aterfax/relax-intel-rmrr/actions/workflows/build-kernel-debs.yml)
  4 | ![Docker Image Size (tag)](https://img.shields.io/docker/image-size/aterfax/relaxable-rmrr-proxmox-kernel-builder/latest)
  5 | ![Docker Pulls](https://img.shields.io/docker/pulls/aterfax/relaxable-rmrr-proxmox-kernel-builder)
  6 | > :warning: Note - this repo is now archived as support for the Relaxed RMRR Mapping is now natively supported by the normal Proxmox kernel as of kernel release 6.2.16-13-pve See: https://bugzilla.proxmox.com/show_bug.cgi?id=4707 https://forum.proxmox.com/threads/updating-upgrading-custom-patched-kernel.129384/#post-591947
  7 |
  8 | This fork has been amended to patch the required iommu source files using ``sed`` rather than ``patch``. This is achieved by using ``sed`` to amend the pve-kernel **Makefile** using several further ``sed`` commands to edit the iommu source file during the make process as this make process pulls the source files (chicken/egg problem.)
9 | 10 | The key ``sed`` commands can be found at: 11 | 12 | - [relax-intel-rmrr/patches/relaxable-rmrr-patch-sed.txt](patches/relaxable-rmrr-patch-sed.txt) 13 | - [relax-intel-rmrr/build/proxmox/build.sh#L157](build/proxmox/build.sh#L157) 14 | 15 | 16 | ## 🐧💨 Now you can use PCI passthrough on broken platforms 17 | 18 | ### TL;DR 19 | When you try to use PCI/PCIe passthrough in KVM/QEMU/Proxmox you get: 20 | ``` 21 | vfio-pci 0000:01:00.1: Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor. 22 | ``` 23 | followed by `vfio: failed to set iommu for container: Operation not permitted`. 24 | 25 | This kernel patch fixes the problem **on kernels v3.17 and up** (tested up to 5.9.1). You can skip to "[Installation](README.md#installation)" 26 | section if you don't care about the rest. Reading of "[Disclaimers](README.md#disclaimers)" section to understand the 27 | risks, and "[Solutions & hacks](deep-dive.md#other-solutions--hacks)" to get the idea of different alternatives is 28 | highly recommended. 29 | 30 | --- 31 | 32 | ### Table of Contents 33 | 1. [Installation](README.md#installation) 34 | - [Proxmox - premade packages](README.md#proxmox---premade-packages-easy) 35 | - [Docker - building from sources](README.md#docker---build-packages-from-sources-intermediate) 36 | - [Proxmox - building from sources](README.md#proxmox---building-from-sources-advanced) 37 | - [Other distros](README.md#other-distros) 38 | 2. [Configuration](README.md#configuration) 39 | 3. 
[Deep Dive](deep-dive.md) - *thorough research on the problem, written for mortals* 40 | - [Technical details](deep-dive.md#technical-details) 41 | - [How virtual machines use memory?](deep-dive.md#how-virtual-machines-use-memory) 42 | - [Why do we need VT-d / AMD-Vi?](deep-dive.md#why-do-we-need-vt-d--amd-vi) 43 | - [How PCI/PCIe actually work?](deep-dive.md#how-pcipcie-actually-work) 44 | - [RMRR - the monster in a closet](deep-dive.md#rmrr---the-monster-in-a-closet) 45 | - [What vendors did wrong?](deep-dive.md#what-vendors-did-wrong) 46 | - [Other solutions & hacks](deep-dive.md#other-solutions--hacks) 47 | - [Contact your platform vendor](deep-dive.md#contact-your-platform-vendor) 48 | - [Use OS which ignores RMRRs](deep-dive.md#use-os-which-ignores-rmrrs) 49 | - [Attempt HPE's pseudofix (if you use HP)](deep-dive.md#attempt-hpes-pseudofix-if-you-use-hp) 50 | - [The comment-the-error-out hack (v3.17 - 5.3)](deep-dive.md#the-comment-the-error-out-hack-v317---53) 51 | - [Long-term solution - utilizing relaxable reservation regions (>=3.17)](deep-dive.md#long-term-solution---utilizing-relaxable-reservation-regions-317) 52 | - [Why commenting-out the error is a bad idea](deep-dive.md#why-commenting-out-the-error-is-a-bad-idea) 53 | - [The kernel moves on quickly](deep-dive.md#the-kernel-moves-on-quickly) 54 | - [What this patch actually does](deep-dive.md#what-this-patch-actually-does) 55 | - [Why kernel patch and not a loadable module?](deep-dive.md#why-kernel-patch-and-not-a-loadable-module) 56 | - [The future](deep-dive.md#the-future) 57 | 4. [Disclaimers](README.md#disclaimers) 58 | 5. [Acknowledgments & References](README.md#acknowledgments--references) 59 | 6. [License](README.md#license) 60 | 61 | --- 62 | 63 | ### Installation 64 | 65 | #### Proxmox - premade packages (easy) 66 | As I believe in *[eating your own dog food](https://en.wikipedia.org/wiki/Eating_your_own_dog_food)* I run the kernel 67 | described here. Thus, I publish precompiled packages. 
68 | 69 | 1. Go to the [releases tab](https://github.com/Aterfax/relax-intel-rmrr/releases) and pick appropriate packages 70 | 2. Download `release.zip`, unzip it and `cd` down to the bottom of the directory tree. (You can copy links and use `wget https://...` and `unzip release.zip` on the server itself) 71 | 3. *(OPTIONAL)* Verify the release signature on ``release.zip`` as discussed here: https://github.com/Aterfax/relax-intel-rmrr/discussions/16 72 | 4. Install all using `dpkg -i *.deb` in the folder where you downloaded the debs 73 | 5. *(OPTIONAL)* Verify the kernel works with the patch disabled by rebooting and checking if `uname -r` shows a version 74 | ending with `-pve-relaxablermrr` 75 | 6. [Configure the kernel](README.md#configuration) 76 | 77 | --- 78 | #### Docker - build packages from sources (intermediate) 79 | 80 | #### Prerequisites 81 | 1. Docker installed (tested on Ubuntu 22.04 & Debian 10). 82 | 2. ~40GB of free space. 83 | 3. Git clone of this repo (if building the image yourself.) 84 | 85 | #### Steps 86 | 87 | 1. (Optional) Build the container image yourself from the top level of the cloned repo (Dockerfile will be present): 88 | 89 | `docker build -t relaxable-rmrr-proxmox-kernel-builder .` 90 | 91 | 2. Run the Docker image with an appropriate host file system binding (you can just pull the image direct from DockerHub, adjust the command below to the correct image name if you are building yourself): 92 | 93 | `docker run --name relaxable-rmrr-proxmox-kernel-builder -v /mnt/scratch/proxmox-kernel-build-area/proxmox-kernel:/build/proxmox/proxmox-kernel -it aterfax/relaxable-rmrr-proxmox-kernel-builder:latest` 94 | 95 | 3. Wait until the build finishes (30 - 300 minutes depending on hardware used) and find the debs on your host file system path e.g. 96 | 97 | `/mnt/scratch/proxmox-kernel-build-area/proxmox-kernel/debs` 98 | 99 | 4. Now you can [install debs like you would premade packages](README.md#proxmox---premade-packages-easy). 
100 | 101 | 5. [Configure the kernel](README.md#configuration) 102 | 103 | Note: If you want to build specific versions you can override the entrypoint from `bash -c "cd /build/proxmox/ && ./build_latest.sh"` to a script version of your choosing e.g. `bash -c "cd /build/proxmox/ && ./build7.1-10.sh"` 104 | 105 | 6. Navigate to your `proxmox-kernel` directory and remove the build files to save space (if desired.) 106 | 107 | --- 108 | 109 | #### Proxmox - building from sources (advanced) 110 | If you're running a version of Proxmox with [no packages available](README.md#proxmox---premade-packages-easy) you can 111 | [compile the kernel yourself using patches provided](build/proxmox/). 112 | 113 | --- 114 | 115 | #### Other distros 116 | 1. Download kernel sources appropriate for your distribution 117 | 2. Apply an appropriate patch to the source tree 118 | - Go to the folder with your kernel source 119 | - For Linux 3.17 - 5.7: `patch -p1 < ../patches/add-relaxable-rmrr-below-5_8.patch` 120 | - For Linux >=5.8: `patch -p1 < ../patches/add-relaxable-rmrr-5_8_and_up.patch` 121 | 3. Follow your distro kernel compilation & installation instruction: 122 | - [Debian](https://wiki.debian.org/BuildADebianKernelPackage) 123 | - [Ubuntu](https://wiki.ubuntu.com/Kernel/BuildYourOwnKernel) 124 | 125 | --- 126 | 127 | ### Configuration 128 | By default, after the kernel is installed, the patch will be *inactive* (i.e. the kernel will behave like this patch was 129 | never applied). To activate it you have to add `intel_iommu=relax_rmrr` to your Linux boot args. 130 | 131 | In most distros (including Proxmox) you do this by: 132 | 1. Opening `/etc/default/grub` (e.g. using `nano /etc/default/grub`) 133 | 2. 
Editing the `GRUB_CMDLINE_LINUX_DEFAULT` to include the option: 134 | - Example of old line: 135 | ``` 136 | GRUB_CMDLINE_LINUX_DEFAULT="quiet intel_iommu=on iommu=pt intremap=no_x2apic_optout" 137 | ``` 138 | - Example of new line: 139 | ``` 140 | GRUB_CMDLINE_LINUX_DEFAULT="quiet intel_iommu=on,relax_rmrr iommu=pt intremap=no_x2apic_optout" 141 | ``` 142 | - *Side note: these are actually options which will make your PCI passthrough work and do so efficiently* 143 | 3. If not running Proxmox, running the `update-grub` command and consulting your Linux distro's instructions on kernel pinning to pin your chosen kernel. 144 | 4. If using Proxmox, running the `proxmox-boot-tool kernel list` command to list your available kernels and then 'pinning' your chosen version with the `proxmox-boot-tool` e.g. `proxmox-boot-tool kernel pin 6.2.11-1-pve-relaxablermrr` 145 | 5. Making sure to take a note or making a calendar event to keep updating your kernels and repinning new releases! 146 | 6. Rebooting 147 | 148 | To verify if the patch is active execute `dmesg | grep 'Intel-IOMMU'` after reboot. You should see a result similar 149 | to this: 150 | 151 | ``` 152 | root@sandbox:~# dmesg | grep 'Intel-IOMMU' 153 | [ 0.050195] DMAR: Intel-IOMMU: assuming all RMRRs are relaxable. This can lead to instability or data loss 154 | root@sandbox:~# 155 | ``` 156 | 157 | --- 158 | 159 | ### Disclaimers 160 | - I'm not a kernel programmer by any means, so if I got something horribly wrong correct me please :) 161 | - This patch should be safe, as long as you don't try to remap devices which are used by the IPMI/BIOS, e.g. 162 | - Network port shared between your IPMI and OS 163 | - RAID card in non-HBA mode with its driver loaded on the host 164 | - Network card with monitoring system installed on the host (e.g. [Intel Active Health System Agent](https://support.hpe.com/hpesc/public/docDisplay?docId=emr_na-c04781229)) 165 | - This is not a supported solution by any of the vendors. 
In fact this is a direct violation of Intel's VT-d specs 166 | (which Linux already violates anyway, but this is increasing the scope). It may cause crashes or major instabilities. 167 | You've been warned. 168 | 169 | --- 170 | 171 | ### Acknowledgments & References 172 | - [Comment-out hack research by dschense](https://forum.proxmox.com/threads/hp-proliant-microserver-gen8-raidcontroller-hp-p410-passthrough-probleme.30547/post-155675) 173 | - [Proxmox kernel compilation & patching by Feni](https://forum.proxmox.com/threads/compile-proxmox-ve-with-patched-intel-iommu-driver-to-remove-rmrr-check.36374/) 174 | - [Linux IOMMU Support](https://www.kernel.org/doc/html/latest/x86/intel-iommu.html) 175 | - [RedHat RMRR EXCLUSION Whitepaper](https://access.redhat.com/sites/default/files/attachments/rmrr-wp1.pdf) 176 | - [Intel® Virtualization Technology for Directed I/O (VT-d)](https://software.intel.com/content/www/us/en/develop/articles/intel-virtualization-technology-for-directed-io-vt-d-enhancing-intel-platforms-for-efficient-virtualization-of-io-devices.html) 177 | - [Intel® Virtualization Technology for Directed I/O Architecture Specification](https://software.intel.com/content/www/us/en/develop/download/intel-virtualization-technology-for-directed-io-architecture-specification.html) 178 | 179 | --- 180 | 181 | ### License 182 | This work (patches & docs) is dual-licensed under MIT and GPL 2.0 (or any later version), which should be treated as an 183 | equivalent of Linux `Dual MIT/GPL` (i.e. pick a license you prefer). 
184 | -------------------------------------------------------------------------------- /build/proxmox/README.md: -------------------------------------------------------------------------------- 1 | ## Proxmox - building from sources 2 | 3 | If you're running a version of Proxmox with [no packages available](../../README.md#proxmox---premade-packages-easy), or 4 | for some reason you don't/can't trust precompiled packages you can compile the kernel yourself using patches provided. 5 | 6 | The easiest way to do it is to clone this repository and use the build script provided, alongside this `README.md` file 7 | ([`build/proxmox/build_latest.sh`](build_latest.sh)) 8 | 9 | 10 | ### How to do it WITHOUT Docker? 11 | This is mostly intended if you want to build & run on your Proxmox host. Jump to [Docker-ized](README.md#how-to-do-it-with-docker) 12 | guide if you want to build packages in an isolated environment. 13 | 14 | #### Prerequisites 15 | 1. Proxmox 6/7 install (recommended) or Debian Buster/Bullseye *(it WILL fail on Ubuntu!)* 16 | 2. Root access. 17 | 3. ~40GB of free space. 18 | 19 | #### Steps 20 | 1. Clone the repo and `cd` to the `build/proxmox/` directory. 21 | 2. Run the [`build_latest.sh`](build_latest.sh) script from terminal: 22 | `RMRR_AUTOINSTALL=1 bash ./build_latest.sh` 23 | *You can also manually execute commands in the script step-by-step. To facilitate that the script contains 24 | extensive comments for every step.* 25 | 26 | 3. *(OPTIONAL)* Verify the kernel works with the patch disabled by rebooting and checking if `uname -r` shows a version 27 | ending with `-pve-relaxablermrr` 28 | 4. [Configure the kernel](../../README.md#configuration) 29 | 5. Navigate to your `proxmox-kernel` directory and remove the build files to save space (if desired.) 30 | 31 | This process will also leave precompiled `*.deb` packages, in case you want to copy them to other Proxmox hosts you have. 32 | 33 | --- 34 | 35 | ### How to do it WITH Docker? 
36 | This is mostly intended for building packages for later use (and/or when you don't want to mess with your OS). 37 | 38 | #### Prerequisites 39 | 1. Docker installed (tested on Ubuntu 22.04 & Debian 10). 40 | 2. ~40GB of free space. 41 | 3. Git clone of this repo (if building the image yourself.) 42 | 43 | #### Steps 44 | 45 | 1. (Optional) Build the container image yourself from the top level of the cloned repo (Dockerfile will be present): 46 | 47 | `docker build -t relaxable-rmrr-proxmox-kernel-builder .` 48 | 49 | 2. Run the Docker image with an appropriate host file system binding (you can just pull the image direct from DockerHub, adjust the command below to the correct image name if you are building yourself): 50 | 51 | `docker run --name relaxable-rmrr-proxmox-kernel-builder -v /mnt/scratch/proxmox-kernel-build-area/proxmox-kernel:/build/proxmox/proxmox-kernel -it aterfax/relaxable-rmrr-proxmox-kernel-builder:latest` 52 | 53 | 3. Wait until the build finishes (30 - 300 minutes depending on hardware used) and find the debs on your host file system path e.g. 54 | 55 | `/mnt/scratch/proxmox-kernel-build-area/proxmox-kernel/debs` 56 | 57 | 4. Now you can [install debs like you would premade packages](../../README.md#proxmox---premade-packages-easy). 58 | 59 | 5. [Configure the kernel](../../README.md#configuration) 60 | 61 | Note: If you want to build specific versions you can override the entrypoint from `bash -c "cd /build/proxmox/ && ./build_latest.sh"` to a script version of your choosing e.g. `bash -c "cd /build/proxmox/ && ./build7.1-10.sh"` 62 | 63 | 6. Navigate to your `proxmox-kernel` directory and remove the build files to save space (if desired.) 
64 | 65 | 66 | -------------------------------------------------------------------------------- /build/proxmox/build.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | : "${PVE_KERNEL_BRANCH:=master}" 5 | : "${RELAX_INTEL_GIT_REPO:=https://github.com/kiler129/relax-intel-rmrr.git}" 6 | : "${PROXMOX_PATCH:=proxmox.patch}" 7 | : "${RELAX_PATCH:=proxmox.patch}" 8 | 9 | echo '###########################################################' 10 | echo '################ Settings ################################' 11 | echo '###########################################################' 12 | 13 | echo "PVE_KERNEL_BRANCH:${PVE_KERNEL_BRANCH}" 14 | echo "RELAX_INTEL_GIT_REPO:${RELAX_INTEL_GIT_REPO}" 15 | echo "PROXMOX_PATCH:${PROXMOX_PATCH}" 16 | echo "RELAX_PATCH:${RELAX_PATCH}" 17 | 18 | 19 | ################################################################################# 20 | # This script is a part of https://github.com/kiler129/relax-intel-rmrr project # 21 | ################################################################################# 22 | 23 | 24 | echo '###########################################################' 25 | echo '############# STEP 0 - PERFORM SANITY CHECKS ##############' 26 | echo '###########################################################' 27 | # Make sure script is working in the directory it is located in 28 | cd "$(dirname "$(readlink -f "$0")")" 29 | SCRIPT_DIR=$(pwd) 30 | 31 | 32 | # Build process will fail if you're not a root (+ apt actions itself need it) 33 | if [[ "$EUID" -ne 0 ]] 34 | then echo "This script should be run bash root" 35 | exit 1 36 | fi 37 | 38 | # Sanity check: make sure no two builds are started nor we have something leftover from previous attempts 39 | if [[ -f "$SCRIPT_DIR/script_running" ]]; then 40 | echo "This script already appears to be running or has not cleaned up correctly." 
41 | echo "To continue please remove $SCRIPT_DIR/script_running if you are sure a script is not already running." 42 | exit 1 43 | fi 44 | 45 | # Set the lockfile. 46 | touch $SCRIPT_DIR/script_running 47 | 48 | if [[ -d "proxmox-kernel" ]]; then 49 | 50 | echo 'Directory "proxmox-kernel" already exists.' 51 | cd proxmox-kernel 52 | 53 | echo "Cleaning debs previous dir if present." 54 | if [[ -d "debs" ]]; then rm -rf debs; fi 55 | 56 | if [[ -d "pve-kernel" ]]; then 57 | echo 'Directory "pve-kernel" already exists - resetting cloned Git repositories.' 58 | cd pve-kernel 59 | git clean -xfd 60 | git submodule foreach --recursive git clean -xfd 61 | git reset --hard 62 | git pull 63 | git checkout ${PVE_KERNEL_BRANCH} 64 | git submodule foreach --recursive git reset --hard 65 | git submodule update --init --recursive 66 | PVE_KERNEL_GIT_DIR_PRESENT=1 67 | cd .. 68 | fi 69 | 70 | if [[ -d "relax-intel-rmrr" ]]; then 71 | cd relax-intel-rmrr 72 | git reset --hard 73 | RELAX_INTEL_RMRR_GIT_DIR_PRESENT=1 74 | cd .. 75 | fi 76 | 77 | cd $SCRIPT_DIR 78 | 79 | fi 80 | 81 | 82 | echo '###########################################################' 83 | echo '############ STEP 1 - INSTALL ALL DEPENDENCIES ############' 84 | echo '###########################################################' 85 | # Check if Proxmox-specific package exists in apt cache. If it does it means apt already knows Proxmox repository, if 86 | # not we need to add it to properly build the kernel 87 | if apt show libpve-common-perl &>/dev/null; then 88 | echo "Step 1.0: Proxmox repository already present - not adding" 89 | else 90 | # Add Proxmox repo & their signing key 91 | echo "Step 1.0: Adding Proxmox apt repository..." 
92 | apt -y update 93 | apt -y install gnupg 94 | # apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 7BF2812E8A6E88E0 95 | wget https://enterprise.proxmox.com/debian/proxmox-release-bullseye.gpg -O /etc/apt/trusted.gpg.d/proxmox-release-bullseye.gpg 96 | echo 'deb http://download.proxmox.com/debian/pve bullseye pve-no-subscription' > /etc/apt/sources.list.d/pve.list 97 | fi 98 | 99 | # Install all packages required to build the kernel & create *.deb packages for installation 100 | echo "Step 1.1: Installing build dependencies..." 101 | apt -y update 102 | apt -y install git nano screen patch fakeroot build-essential devscripts libncurses5 libncurses5-dev libssl-dev bc \ 103 | flex bison libelf-dev libaudit-dev libgtk2.0-dev libperl-dev asciidoc xmlto gnupg gnupg2 rsync lintian debhelper \ 104 | libdw-dev libnuma-dev libslang2-dev sphinx-common asciidoc-base automake cpio dh-python file gcc kmod libiberty-dev \ 105 | libpve-common-perl libtool perl-modules python3-minimal python3-dev sed tar zlib1g-dev lz4 curl zstd dwarves 106 | 107 | 108 | 109 | echo '###########################################################' 110 | echo '############ STEP 2 - DOWNLOAD CODE TO COMPILE ############' 111 | echo '###########################################################' 112 | # Create working directory 113 | echo "Step 2.0: Creating working directory" 114 | mkdir -p proxmox-kernel 115 | cd proxmox-kernel 116 | 117 | # Clone official Proxmox kernel repo & Relaxed RMRR Mapping patch if not already present. 
118 | echo "Step 2.1: Downloading Proxmox kernel toolchain & patches" 119 | 120 | if [[ $PVE_KERNEL_GIT_DIR_PRESENT -ne 1 ]]; then 121 | git clone git://git.proxmox.com/git/pve-kernel.git 122 | fi 123 | 124 | if [[ $RELAX_INTEL_RMRR_GIT_DIR_PRESENT -ne 1 ]]; then 125 | git clone --depth=1 ${RELAX_INTEL_GIT_REPO} 126 | fi 127 | 128 | # Go to the actual Proxmox toolchain 129 | cd pve-kernel 130 | 131 | #Checkout the correct branch 132 | git checkout ${PVE_KERNEL_BRANCH} 133 | 134 | echo "Showing Git status:" 135 | git status 136 | 137 | # (OPTIONAL) Download flat copy of Ubuntu hirsute kernel submodule 138 | # If you skip this the "make" of Proxmox kernel toolchain will download a copy (a Proxmox kernel is based on Ubuntu 139 | # If you skip this the "make" of Proxmox kernel toolchain will download a copy (a Proxmox kernel is based on Ubuntu 140 | # hirsute kernel). However, it will download it with the whole history etc which takes A LOT of space (and time). This 141 | # bypasses the process safely. 
142 | # This curl skips certificate validation because Proxmox GIT WebUI doesn't send Let's Encrypt intermediate cert 143 | echo "Step 2.2: Downloading base kernel" 144 | #TODO: This needs a proxmox7 fix 145 | # curl -k "https://git.proxmox.com/?p=mirror_ubuntu-hirsute-kernel.git;a=snapshot;h=$(git submodule status submodules/ubuntu-hirsute | cut -c 2-41);sf=tgz" --output kernel.tgz 146 | # tar -xf kernel.tgz -C submodules/ubuntu-hirsute/ --strip 1 147 | # rm kernel.tgz 148 | 149 | 150 | 151 | echo '###########################################################' 152 | echo '################# STEP 3 - CREATE KERNEL ##################' 153 | echo '###########################################################' 154 | echo "Step 3.0: Applying patches" 155 | #cp ../relax-intel-rmrr/patches/${RELAX_PATCH} ./patches/kernel/CUSTOM-add-relaxable-rmrr.patch 156 | #cp ../relax-intel-rmrr/patches/relaxable-rmrr-patch-sed.txt ./patches/kernel/ 157 | sed -i '/^$(KERNEL_SRC).prepared: $(KERNEL_SRC_SUBMODULE) | submodule/r ../../../../patches/relaxable-rmrr-patch-sed.txt' Makefile 158 | patch -p1 < ../relax-intel-rmrr/patches/${PROXMOX_PATCH} 159 | 160 | 161 | echo "Step 3.1: Compiling kernel... (it will take 30m-3h)" 162 | # Note: DO NOT add -j to this make, see https://github.com/kiler129/relax-intel-rmrr/issues/1 163 | # This step will compile kernel & build all *.deb packages as Proxmox builds internally 164 | make clean 165 | make 166 | 167 | 168 | echo '###########################################################' 169 | echo '################ STEP 4 - INSTALL KERNEL ##################' 170 | echo '###########################################################' 171 | echo "Step 4: Installing packages" 172 | 173 | if [[ -v RMRR_AUTOINSTALL ]]; then 174 | apt install ./*.deb 175 | else 176 | echo '=====>>>> SKIPPED - to enable autoinstallation set "RMRR_AUTOINSTALL" environment variable.' 
177 | echo '=====>>>> To install execute "dpkg -i *.deb" after this script finishes' 178 | fi 179 | 180 | echo '###########################################################' 181 | echo '################## STEP 5 - CLEANUP #######################' 182 | echo '###########################################################' 183 | # Remove all (~30GB) of stuff leftover after compilation 184 | echo "Step 5: Cleaning up..." 185 | cd .. 186 | mkdir -p $SCRIPT_DIR/proxmox-kernel/debs 187 | mv pve-kernel/*.deb $SCRIPT_DIR/proxmox-kernel/debs 188 | #rm -rf pve-kernel 189 | #rm -rf relax-intel-rmrr 190 | 191 | # Remove the lockfile. 192 | rm $SCRIPT_DIR/script_running 193 | exit 0 194 | -------------------------------------------------------------------------------- /build/proxmox/build7.1-10.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | export PVE_KERNEL_BRANCH=pve-kernel-5.13 5 | export RELAX_INTEL_GIT_REPO="https://github.com/Aterfax/relax-intel-rmrr.git" 6 | export RELAX_PATCH="add-relaxable-rmrr-5_13.patch" 7 | export PROXMOX_PATCH="proxmox7.patch" 8 | 9 | ./build.sh 10 | -------------------------------------------------------------------------------- /build/proxmox/build7.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | export PVE_KERNEL_BRANCH=pve-kernel-5.13 5 | export RELAX_INTEL_GIT_REPO="https://github.com/Aterfax/relax-intel-rmrr.git" 6 | export RELAX_PATCH="add-relaxable-rmrr-5_13.patch" 7 | export PROXMOX_PATCH="proxmox7.patch" 8 | 9 | ./build.sh 10 | -------------------------------------------------------------------------------- /build/proxmox/build_latest.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | export PVE_KERNEL_BRANCH=master 5 | export RELAX_INTEL_GIT_REPO="https://github.com/Aterfax/relax-intel-rmrr.git" 6 | export 
RELAX_PATCH="add-relaxable-rmrr-5_15.patch" 7 | export PROXMOX_PATCH="proxmox7.patch" 8 | 9 | ./build.sh 10 | -------------------------------------------------------------------------------- /build/proxmox/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | build-cont: 4 | image: build-img 5 | build: . 6 | stdin_open: true 7 | tty: true 8 | volumes: 9 | - ./debs:/build/proxmox-kernel/debs -------------------------------------------------------------------------------- /deep-dive.md: -------------------------------------------------------------------------------- 1 | ### Deep Dive into the problem 2 | 3 | ### Table of Contents 4 | 1. [Installation](README.md#installation) 5 | - [Proxmox - premade packages](README.md#proxmox---premade-packages) 6 | - [Proxmox - building from sources](README.md#proxmox---building-from-sources) 7 | - [Other distros](README.md#other-distros) 8 | 2. [Configuration](README.md#configuration) 9 | 3. 
**Deep Dive** <= you're here 10 | - [Technical details](deep-dive.md#technical-details) 11 | - [How virtual machines use memory?](deep-dive.md#how-virtual-machines-use-memory) 12 | - [Why do we need VT-d / AMD-Vi?](deep-dive.md#why-do-we-need-vt-d--amd-vi) 13 | - [How PCI/PCIe actually work?](deep-dive.md#how-pcipcie-actually-work) 14 | - [RMRR - the monster in a closet](deep-dive.md#rmrr---the-monster-in-a-closet) 15 | - [What vendors did wrong?](deep-dive.md#what-vendors-did-wrong) 16 | - [Other solutions & hacks](deep-dive.md#other-solutions--hacks) 17 | - [Contact your platform vendor](deep-dive.md#contact-your-platform-vendor) 18 | - [Use OS which ignores RMRRs](deep-dive.md#use-os-which-ignores-rmrrs) 19 | - [Attempt HPE's pseudofix (if you use HP)](deep-dive.md#attempt-hpes-pseudofix-if-you-use-hp) 20 | - [The comment-the-error-out hack (v3.17 - 5.3)](deep-dive.md#the-comment-the-error-out-hack-v317---53) 21 | - [Long-term solution - utilizing relaxable reservation regions (>=3.17)](deep-dive.md#long-term-solution---utilizing-relaxable-reservation-regions-317) 22 | - [Why commenting-out the error is a bad idea](deep-dive.md#why-commenting-out-the-error-is-a-bad-idea) 23 | - [The kernel moves on quickly](deep-dive.md#the-kernel-moves-on-quickly) 24 | - [What this patch actually does](deep-dive.md#what-this-patch-actually-does) 25 | - [Why kernel patch and not a loadable module?](deep-dive.md#why-kernel-patch-and-not-a-loadable-module) 26 | - [The future](deep-dive.md#the-future) 27 | 4. [Disclaimers](README.md#disclaimers) 28 | 5. [Acknowledgments & References](README.md#acknowledgments--references) 29 | 6. [License](README.md#license) 30 | 31 | --- 32 | 33 | ### Technical details 34 | 35 | #### How virtual machines use memory? 36 | To understand PCI passthrough we first need to understand how VMs work. Each VM launched in the system gets a new 37 | virtual address space and has no direct access to the host memory. 
Yet, the guest OS runs like it was running with a 38 | real RAM, using any memory addresses it wants. In other words the guest OS has no idea (in terms of memory) that it is 39 | being virtualized. Logically there has to be some map to translate guest OS requests to the real memory addresses, since 40 | multiple guest OSes have to share the same physical host memory. The hypervisor (host OS) is responsible for maintaining 41 | a map between GPA (Guest Physical Address) and HPA (Host Physical Address). To better understand this look at the (VERY 42 | simplified) graphics: 43 | 44 | ``` 45 | +--------------------------------HOST----------------------------------------+ 46 | | | 47 | | +--------------------------HOST MEMORY-------------------------------+ | 48 | | | +-------+ +----------GUEST MEMORY-----------+ | | 49 | | | | vim | |---------------------------------| | | 50 | | | | mem | |---------------------------------| | | 51 | | | +-------+ +---------------------------------+ | | 52 | | | 0xA000 0xA100 | | 53 | | +--------------------------------------------------------------------+ | 54 | | 0x0000 0xF000 0xF0FF 0x....| 55 | | | 56 | | +--------+ +----------------GUEST VM------------------+ | 57 | | | | | +------------GUEST MEMORY--------------+ | | 58 | | | vim | | | | | | | | 59 | | | | | | guest kernel| wget | | | | 60 | | +--------+ | | | mem | | | | 61 | | | +-------------+--------+---------------+ | | 62 | | | 0x00 0x1E 0x20 0xFF | | 63 | | | +------+ | | 64 | | | | wget | | | 65 | | | +------+ | | 66 | | +------------------------------------------+ | 67 | +----------------------------------------------------------------------------+ 68 | 69 | (addresses don't represent real x86 space[!] and are not drawn to scale) 70 | ``` 71 | 72 | When a VM is run the hypervisor gives it a predetermined amount of memory and tells the guest OS that it has a contiguous 73 | space of 255 bytes. 
The guest OS knows it can use 255 bytes from 0x00 and doesn't care/know where this memory physically 74 | resides. Host OS now needs to find space for 255 bytes, either in one or multiple chunks in the physical memory. It can 75 | map it as on the diagram to one big chunk or split it into multiple ones, as long as it can map guest request for its 76 | `0x1E`-`0x20` to e.g. `0xF010`-`0xF012` and return the data. 77 | 78 | --- 79 | 80 | #### Why do we need VT-d / AMD-Vi? 81 | While mapping the memory (as described in the previous section) the host OS must take care of three things: 82 | 1. When guest OS requests a page from memory using its (GPA) address it will get it from the HPA-addressed memory (=mapping) 83 | 2. Memory of the guest cannot be touched by anything other than the guest (=protection) 84 | 3. The process needs to be fast 85 | 86 | While the first two are achievable with pure software emulation, it makes the memory access process slow as molasses 87 | since it can no longer rely on [DMA](https://en.wikipedia.org/wiki/Direct_memory_access) but must involve the CPU for every 88 | byte shifted back and forth. 89 | Both VT-d and AMD-Vi essentially allow instructing the hardware to do the mapping and enforce domains (security 90 | boundaries). In such a case the host OS simply needs to inform the hardware about the address to be translated on-the-fly. 91 | 92 | More on that can be found in the [Intel VT-d docs](https://software.intel.com/content/www/us/en/develop/articles/intel-virtualization-technology-for-directed-io-vt-d-enhancing-intel-platforms-for-efficient-virtualization-of-io-devices.html). 93 | 94 | --- 95 | 96 | #### How PCI/PCIe actually work? 97 | Most people blindly plop `intel_iommu=on` and `iommu=pt` into their kernel line and get surprised when things don't 98 | work. I did too, so I started digging, which resulted in this whole repository. 99 | 100 | Every device in the system has some reserved memory address space. 
It's used by the device and the host 101 | system to communicate and exchange data. That reserved memory address is dictated by the firmware (i.e. BIOS) as both 102 | the device and OS must know it to communicate. In essence this is just slightly different than normal memory mapping. 103 | Here, you don't have just some OS using the memory but an OS **and** a device using the memory. 104 | 105 | Here's where [IOMMU](https://en.wikipedia.org/wiki/Input–output_memory_management_unit) comes into play. In essence it's 106 | able to remap GPA to HPA for both the OS and the device so that they can talk to each other. When device memory is 107 | remapped the guest OS talks to the hardware like it was really under some physical address it expects, while in reality 108 | the [IOMMU](https://en.wikipedia.org/wiki/Input–output_memory_management_unit) moves the reserved region aperture 109 | somewhere else in the address space. This is *usually* fine. 110 | 111 | --- 112 | 113 | #### RMRR - the monster in a closet 114 | While both AMD and Intel allow for [IOMMU](https://en.wikipedia.org/wiki/Input–output_memory_management_unit) remapping 115 | device's memory, Intel had an idea to introduce RMRR (Reserved Memory Region Reporting). In essence the firmware/BIOS 116 | publishes a list of regions where usage of [IOMMU](https://en.wikipedia.org/wiki/Input–output_memory_management_unit) is 117 | ostensibly prohibited. The original intent for that feature was good, by allowing for USB keyboards to be automagically 118 | emulated by the USB controller itself before USB driver is loaded, like they were connected via PS/2. This also allows 119 | the GPU to display the picture before OS is loaded and even before [IOMMU](https://en.wikipedia.org/wiki/Input–output_memory_management_unit) 120 | is initialized. 
121 | However, it required some sacrifices: that memory should not be remapped as only OS and the device use the [IOMMU](https://en.wikipedia.org/wiki/Input–output_memory_management_unit) 122 | and devices on the motherboard which may be communicating with e.g. the GPU pre-boot don't know anything about the 123 | mapping. 124 | 125 | However, one *undocumented assumption* was made: as soon as the driver is loaded the "out-of-band" access to the device 126 | ends and the OS takes over. However, *technically* the VT-d specification says that the RMRR is valid indefinitely. 127 | 128 | Linux for a long time (up until [v3.17rc1](https://github.com/torvalds/linux/commit/c875d2c1b8083cd627ea0463e20bf22c2d7421ee)) 129 | didn't respect RMRR while setting up [IOMMU](https://en.wikipedia.org/wiki/Input–output_memory_management_unit) 130 | respecting that against-the-specs but ubiquitous assumption. This was an oversight as [IOMMU](https://en.wikipedia.org/wiki/Input–output_memory_management_unit) 131 | API assumes exclusive control over the remapped address space. If such space is remapped the DMA access from outside of 132 | the [IOMMU](https://en.wikipedia.org/wiki/Input–output_memory_management_unit) domain (i.e. from something else than the 133 | host or VM guest OS, like a device on the motherboard) will fail which may lead to unpredictable results if the hardware 134 | vendor didn't follow the *undocumented assumption*. 
135 | 136 | 137 | Linux, as of now, excludes two specific classes of devices from being constrained by RMRR: 138 | - USB devices (as we historically trust they don't do weird things) 139 | - GPUs (unspoken rule that they're accessed out-of-band only before the driver loads) 140 | 141 | 142 | RMRR *by itself* isn't evil, as long as it's used as [Intel's VT-d specification](https://software.intel.com/content/www/us/en/develop/download/intel-virtualization-technology-for-directed-io-architecture-specification.html) 143 | intended - "*[RMRRs] that are either not DMA targets, or memory ranges that may be target of BIOS 144 | initiated DMA only during pre-boot phase (such as from a boot disk drive) **must not** be included in the reserved 145 | memory region reporting.*". 146 | 147 | 148 | Intel anticipated that some will be tempted to misuse the feature as they warned in the VT-d specification: "*RMRR 149 | regions are expected to be used for legacy usages (...). Platform designers should avoid or limit use of reserved memory 150 | regions*". 151 | 152 | ---- 153 | 154 | #### What vendors did wrong? 155 | HP (and probably others) decided to mark **every freaking PCI device memory space as RMRR!**`*` Like that, 156 | just in case... just so that their tools could potentially maybe monitor these devices while OS agent is not installed. But 157 | wait, there's more! They marked **ALL** devices as such, even third party ones physically installed in motherboard's 158 | PCI/PCIe slots! 159 | 160 | This in turn killed PCI passthrough for any of the devices in systems running Linux [>=3.17rc1](https://github.com/torvalds/linux/commit/c875d2c1b8083cd627ea0463e20bf22c2d7421ee). 
161 | 162 | *`*` In case you skipped other sections above, RMRR is a special part of the memory which cannot be moved 163 | to a VM.* 164 | 165 | --- 166 | 167 | ### Other solutions & hacks 168 | 169 | #### Contact your platform vendor 170 | As the error suggests you can try to convince your vendor to fix the BIOS. If you do please create an issue in this repo 171 | to tell me about it, as this is **the only** real solution to the problem. 172 | 173 | --- 174 | 175 | #### Use OS which ignores RMRRs 176 | Some operating systems, notably [VMWare ESXi and vSphere](https://www.vmware.com/products/esxi-and-esx.html), are 177 | believed to ignore RMRRs (cannot be verified as they're closed-source). They're able to passthrough the devices without 178 | a problem, as long as you don't do something deliberately dangerous (see [Disclaimers](README.md#disclaimers)). 179 | 180 | --- 181 | 182 | #### Attempt HPE's pseudofix (if you use HP) 183 | To HPE's credit, they [recognized the problem and released an advisory with mitigations](https://support.hpe.com/hpesc/public/docDisplay?docId=emr_na-c04781229). 184 | In short the HPE's solution is threefold: 185 | 1. Fix the firmware to not include GPUs in RMRR 186 | 2. Use System Configuration utility on Gen9+ servers to disable "HP Shared Memory features" on selected HPs cards 187 | 3. Use their CLI BIOS/RBSU reconfiguration utility to set a special (invisible in menus) flags opting-out PCIe slots 188 | from "smart monitoring" 189 | 190 | However, we wouldn't be here if it actually worked as expected: 191 | - Fix 1 works only on GPUs and affects Linux 3.17-5.4 (as kernel has GPU exclusion since 5.4) 192 | - Fix 2 only works on *some* **external** HPE ethernet adapters with Gen9 and newer servers 193 | - Fix 3 theoretically works on all NICs, but not other cards (e.g. 
HBAs) and [doesn't actually work](https://community.hpe.com/t5/proliant-servers-netservers/microserver-gen8-quot-device-is-ineligible-for-iommu-domain/td-p/6947461#.X5D7SS9h1TY) 194 | (sic!) on some servers which are listed as affected (e.g. widely popular [HP/HPE Microserver Gen8](https://support.hpe.com/hpesc/public/docDisplay?docId=emr_na-c03793258)) 195 | 196 | Some tried [opening a support case](https://community.hpe.com/t5/proliant-servers-netservers/re-device-is-ineligible-for-iommu-domain-attach-due-to-platform/m-p/6817728/highlight/true#M21006) 197 | but the topic dried out. I tried [nagging HPE to fix the BIOS](https://community.hpe.com/t5/proliant-servers-ml-dl-sl/disabling-rmrds-rmrr-hp-shared-memory-features-on-microserver/td-p/7105623#.X5C0oy9h2uV). 198 | Maybe there's a chance? Who knows... the future will show. 199 | 200 | --- 201 | 202 | #### The comment-the-error-out hack (v3.17 - 5.3) 203 | I was able to track the first mentions of this method to [a post by dschense on a German Proxmox forum](https://forum.proxmox.com/threads/hp-proliant-microserver-gen8-raidcontroller-hp-p410-passthrough-probleme.30547/post-155675) 204 | ([en version](https://translate.googleusercontent.com/translate_c?depth=2&pto=aue&rurl=translate.google.com&sl=de&tl=en&u=https://forum.proxmox.com/threads/hp-proliant-microserver-gen8-raidcontroller-hp-p410-passthrough-probleme.30547/post-155675)). 205 | 206 | In essence this was a logical conclusion: if you have an error comment it out and see what happens. It worked on the 207 | original protection being introduced in Linux v3.17. Unfortunately, the Linux v5.3 changed a lot (see [next section](deep-dive.md#long-term-solution---utilizing-relaxable-reservation-regions-317)). 
208 | 209 | --- 210 | 211 | #### Long-term solution - utilizing relaxable reservation regions (>=3.17) 212 | 213 | ##### Why commenting-out the error is a bad idea 214 | Before Linux v5.3 RMRRs protection relied on [a simple patch introduced in v3.17](https://github.com/torvalds/linux/commit/c875d2c1b8083cd627ea0463e20bf22c2d7421ee) 215 | which excluded USB devices. [Commenting out the error](#the-comment-the-error-out-hack-v317---53) was a working 216 | solution, as the kernel (including KVM subsystem) didn't care about the reserved regions. 217 | 218 | The situation changed dramatically. A large change aimed to [introduce IOVA list management](https://patchwork.kernel.org/project/kvm/cover/20190723160637.8384-1-shameerali.kolothum.thodi@huawei.com/) 219 | outside of the [IOMMU](https://en.wikipedia.org/wiki/Input–output_memory_management_unit) driver was introduced. About 220 | the same time the RMRRs reserved memory [was split into two logical buckets](https://github.com/torvalds/linux/commit/1c5c59fbad20a63954de07687e4a29af18d1be12): 221 | absolutely-reserved (`IOMMU_RESV_DIRECT`) and so-called relaxed (`IOMMU_RESV_DIRECT_RELAXABLE`). USB devices and now 222 | GPUs were marked as *"relaxable"* as they were deemed safe to be remapped (even if against the VT-d specs and 223 | firmware's will). 224 | 225 | 226 | ##### The kernel moves on quickly 227 | Other subsystems naturally [started utilizing](https://github.com/torvalds/linux/commit/9b77e5c79840fc334a5b7f770c5ab0c09dc0e028) 228 | that new IOVA interface, which broke the *"[comment-the-error-out](#the-comment-the-error-out-hack-v317---53)"* patch. 229 | Now with the [IOMMU](https://en.wikipedia.org/wiki/Input–output_memory_management_unit) error message commented out QEMU 230 | [will explode on vfio_dma_map()](https://bugs.launchpad.net/qemu/+bug/1869006/comments/14). 
231 | Understandably, and for good reasons, [developers refuse to accommodate any requests to disable that](https://bugs.launchpad.net/qemu/+bug/1869006/comments/18). 232 | While even more checks can be commented-out and patched, as more subsystems in the kernel start relying on the IOVA 233 | lists management, it will be a cat-and-mouse game after every kernel release. 234 | 235 | 236 | ##### What this patch actually does 237 | The patch plugs into the same mechanism as the vanilla kernel used to [mark USB and GPUs as "relaxable"](https://github.com/torvalds/linux/commit/1c5c59fbad20a63954de07687e4a29af18d1be12). 238 | This has three benefits: 239 | - The RMRR is not fully NULLified, as the memory is marked as reserved-with-exceptions and not just not reserved. This, 240 | combined with IOVA list management, ensures that if some code somewhere needs to work differently with relaxable 241 | devices it will work with this patch properly. 242 | - This patch doesn't introduce inconsistent state in the kernel. RMRRs are not hidden from the kernel by removal, nor 243 | ignored just in one place. This patch just changes the designation of these regions from `IOMMU_RESV_DIRECT` (*"we 244 | know it's reserved and we will hold your hand"*) to [`IOMMU_RESV_DIRECT_RELAXABLE`](https://lore.kernel.org/patchwork/patch/1079954/) 245 | (*"we know it's reserved but it's your playground"*). 246 | - It works across all affected kernels (v5.9.1 being the newest at the time of writing) 247 | 248 | Additionally, this mechanism is [controllable with a boot option](README.md#configuration) making it safe and easy to 249 | disable as needed. 250 | 251 | 252 | ##### Why kernel patch and not a loadable module? 253 | Before taking this approach I poked around to see if the [IOMMU driver](https://github.com/torvalds/linux/tree/master/drivers/iommu/intel) 254 | has any API around RMRR. It does not. The driver doesn't export any functions which can make the module feasible. 
255 | While Linux >=5.3 has the IOVA list management interface, it is [being built by the Intel IOMMU driver](https://github.com/torvalds/linux/commit/1c5c59fbad20a63954de07687e4a29af18d1be12). 256 | What it means is the hardcoded relaxable logic [decides about IOVA designation](https://github.com/torvalds/linux/commit/1c5c59fbad20a63954de07687e4a29af18d1be12#diff-e1fff7a2368c04e11696812359f854de9da431c63ec7c5a7bec8f6020e112a2aR2916). 257 | Later on the same logic is [used for the final sanity check](https://github.com/torvalds/linux/blob/5f9e832c137075045d15cd6899ab0505cfb2ca4b/drivers/iommu/intel-iommu.c#L5057) 258 | independently from the state of the memory saved in the IOVA list. Only after this check passes the IOMMU mapping is 259 | added. 260 | 261 | In other words even if >=5.4 [IOVA API is used to modify](https://github.com/torvalds/linux/commit/af029169b8fdae31064624d60b5469a3da95ad32) 262 | the assignment, the actual IOMMU remapping will fail with *"Device is ineligible for IOMMU domain attach..."* error. 263 | 264 | 265 | #### The future 266 | It would be great if this patch could be upstreamed. However, I see slim-to-none chance of that happening, as this change 267 | is prone to abuse. However, I will definitely try to communicate with kernel folks on how to proceed. 
268 | -------------------------------------------------------------------------------- /patches/add-relaxable-rmrr-5_11.patch: -------------------------------------------------------------------------------- 1 | --- a/drivers/iommu/intel/iommu.c 2021-07-30 16:21:22.235520365 +0100 2 | +++ b/drivers/iommu/intel/iommu.c 2021-07-30 16:28:28.905719413 +0100 3 | @@ -355,6 +355,7 @@ 4 | static int intel_iommu_superpage = 1; 5 | static int iommu_identity_mapping; 6 | static int iommu_skip_te_disable; 7 | +static int intel_relaxable_rmrr = 0; 8 | 9 | #define IDENTMAP_GFX 2 10 | #define IDENTMAP_AZALIA 4 11 | @@ -455,7 +456,10 @@ 12 | } else if (!strncmp(str, "tboot_noforce", 13)) { 13 | pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n"); 14 | intel_iommu_tboot_noforce = 1; 15 | - } 16 | + } else if (!strncmp(str, "relax_rmrr", 10)) { 17 | + pr_info("Intel-IOMMU: assuming all RMRRs are relaxable. This can lead to instability or data loss\n"); 18 | + intel_relaxable_rmrr = 1; 19 | + } 20 | 21 | str += strcspn(str, ","); 22 | while (*str == ',') 23 | @@ -2802,7 +2806,7 @@ 24 | return false; 25 | 26 | pdev = to_pci_dev(dev); 27 | - if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev)) 28 | + if (intel_relaxable_rmrr || IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev)) 29 | return true; 30 | else 31 | return false; 32 | -------------------------------------------------------------------------------- /patches/add-relaxable-rmrr-5_13.patch: -------------------------------------------------------------------------------- 1 | --- a/drivers/iommu/intel/iommu.c 2022-02-26 13:51:33.821885509 +0100 2 | +++ b/drivers/iommu/intel/iommu.c 2022-02-26 13:58:27.231463792 +0100 3 | @@ -364,6 +364,7 @@ 4 | static int intel_iommu_superpage = 1; 5 | static int iommu_identity_mapping; 6 | static int iommu_skip_te_disable; 7 | +static int intel_relaxable_rmrr = 0; 8 | 9 | #define IDENTMAP_GFX 2 10 | #define IDENTMAP_AZALIA 4 11 | @@ -465,6 +466,9 @@ 12 | } else if 
(!strncmp(str, "tboot_noforce", 13)) { 13 | pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n"); 14 | intel_iommu_tboot_noforce = 1; 15 | + } else if (!strncmp(str, "relax_rmrr", 10)) { 16 | + pr_info("Intel-IOMMU: assuming all RMRRs are relaxable. This can lead to instability or data loss\n"); 17 | + intel_relaxable_rmrr = 1; 18 | } else { 19 | pr_notice("Unknown option - '%s'\n", str); 20 | } 21 | @@ -2846,7 +2850,7 @@ 22 | return false; 23 | 24 | pdev = to_pci_dev(dev); 25 | - if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev)) 26 | + if (intel_relaxable_rmrr || IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev)) 27 | return true; 28 | else 29 | return false; 30 | -------------------------------------------------------------------------------- /patches/add-relaxable-rmrr-5_15.patch: -------------------------------------------------------------------------------- 1 | --- a/drivers/iommu/intel/iommu.c 2022-05-04 18:50:13.078092713 +0100 2 | +++ b/drivers/iommu/intel/iommu.c 2022-05-04 18:45:09.909672434 +0100 3 | @@ -345,6 +345,7 @@ 4 | static int intel_iommu_superpage = 1; 5 | static int iommu_identity_mapping; 6 | static int iommu_skip_te_disable; 7 | +static int intel_relaxable_rmrr = 0; 8 | 9 | #define IDENTMAP_GFX 2 10 | #define IDENTMAP_AZALIA 4 11 | @@ -450,6 +451,9 @@ 12 | } else if (!strncmp(str, "tboot_noforce", 13)) { 13 | pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n"); 14 | intel_iommu_tboot_noforce = 1; 15 | + } else if (!strncmp(str, "relax_rmrr", 10)) { 16 | + pr_info("Intel-IOMMU: assuming all RMRRs are relaxable. 
This can lead to instability or data loss\n"); 17 | + intel_relaxable_rmrr = 1; 18 | } else { 19 | pr_notice("Unknown option - '%s'\n", str); 20 | } 21 | @@ -2832,7 +2836,7 @@ 22 | return false; 23 | 24 | pdev = to_pci_dev(dev); 25 | - if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev)) 26 | + if (intel_relaxable_rmrr || IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev)) 27 | return true; 28 | else 29 | return false; 30 | -------------------------------------------------------------------------------- /patches/add-relaxable-rmrr-5_8_and_up.patch: -------------------------------------------------------------------------------- 1 | --- a/drivers/iommu/intel/iommu.c 2 | +++ b/drivers/iommu/intel/iommu.c 3 | @@ -356,6 +356,7 @@ 4 | static int intel_iommu_superpage = 1; 5 | static int iommu_identity_mapping; 6 | static int intel_no_bounce; 7 | +static int intel_relaxable_rmrr = 0; 8 | static int iommu_skip_te_disable; 9 | 10 | #define IDENTMAP_GFX 2 11 | @@ -463,6 +464,9 @@ 12 | } else if (!strncmp(str, "nobounce", 8)) { 13 | pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n"); 14 | intel_no_bounce = 1; 15 | + } else if (!strncmp(str, "relax_rmrr", 10)) { 16 | + pr_info("Intel-IOMMU: assuming all RMRRs are relaxable. 
This can lead to instability or data loss\n"); 17 | + intel_relaxable_rmrr = 1; 18 | } 19 | 20 | str += strcspn(str, ","); 21 | @@ -2863,7 +2867,7 @@ 22 | return false; 23 | 24 | pdev = to_pci_dev(dev); 25 | - if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev)) 26 | + if (intel_relaxable_rmrr || IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev)) 27 | return true; 28 | else 29 | return false; 30 | 31 | -------------------------------------------------------------------------------- /patches/add-relaxable-rmrr-below-5_8.patch: -------------------------------------------------------------------------------- 1 | --- a/drivers/iommu/intel-iommu.c 2 | +++ b/drivers/iommu/intel-iommu.c 3 | @@ -367,6 +367,7 @@ static int intel_iommu_strict; 4 | static int intel_iommu_superpage = 1; 5 | static int iommu_identity_mapping; 6 | static int intel_no_bounce; 7 | +static int intel_relaxable_rmrr = 0; 8 | 9 | #define IDENTMAP_ALL 1 10 | #define IDENTMAP_GFX 2 11 | @@ -468,6 +469,9 @@ static int __init intel_iommu_setup(char *str) 12 | } else if (!strncmp(str, "nobounce", 8)) { 13 | pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n"); 14 | intel_no_bounce = 1; 15 | + } else if (!strncmp(str, "relax_rmrr", 10)) { 16 | + pr_info("Intel-IOMMU: assuming all RMRRs are relaxable. 
This can lead to instability or data loss\n"); 17 | + intel_relaxable_rmrr = 1; 18 | } 19 | 20 | str += strcspn(str, ","); 21 | @@ -2866,7 +2870,7 @@ static bool device_rmrr_is_relaxable(struct device *dev) 22 | return false; 23 | 24 | pdev = to_pci_dev(dev); 25 | - if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev)) 26 | + if (intel_relaxable_rmrr || IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev)) 27 | return true; 28 | else 29 | return false; 30 | -------------------------------------------------------------------------------- /patches/proxmox.patch: -------------------------------------------------------------------------------- 1 | --- a/Makefile 2 | +++ b/Makefile 3 | @@ -11,7 +11,7 @@ 4 | KERNEL_MAJMIN=$(KERNEL_MAJ).$(KERNEL_MIN) 5 | KERNEL_VER=$(KERNEL_MAJMIN).$(KERNEL_PATCHLEVEL) 6 | 7 | -EXTRAVERSION=-${KREL}-pve 8 | +EXTRAVERSION=-${KREL}-pve-relaxablermrr 9 | KVNAME=${KERNEL_VER}${EXTRAVERSION} 10 | PACKAGE=pve-kernel-${KVNAME} 11 | HDRPACKAGE=pve-headers-${KVNAME} 12 | -------------------------------------------------------------------------------- /patches/proxmox7.patch: -------------------------------------------------------------------------------- 1 | --- a/Makefile 2 | +++ b/Makefile 3 | @@ -13,7 +13,7 @@ PKGREL=1 4 | KERNEL_MAJMIN=$(KERNEL_MAJ).$(KERNEL_MIN) 5 | KERNEL_VER=$(KERNEL_MAJMIN).$(KERNEL_PATCHLEVEL) 6 | 7 | -EXTRAVERSION=-$(KREL)-pve 8 | +EXTRAVERSION=-$(KREL)-pve-relaxablermrr 9 | KVNAME=$(KERNEL_VER)$(EXTRAVERSION) 10 | PACKAGE=pve-kernel-$(KVNAME) 11 | HDRPACKAGE=pve-headers-$(KVNAME) 12 | -------------------------------------------------------------------------------- /patches/relaxable-rmrr-patch-sed.txt: -------------------------------------------------------------------------------- 1 | sed -i '/^static int iommu_skip_te_disable;.*/a static int intel_relaxable_rmrr = 0;' ${KERNEL_SRC_SUBMODULE}/drivers/iommu/intel/iommu.c 2 | sed -i 's/if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))/if (intel_relaxable_rmrr || IS_USB_DEVICE(pdev) || 
IS_GFX_DEVICE(pdev))/g' ${KERNEL_SRC_SUBMODULE}/drivers/iommu/intel/iommu.c 3 | sed -i '/intel_iommu_tboot_noforce = 1;/a \\ \ } else if (!strncmp(str, "relax_rmrr", 10)) {\n\ \ \ pr_info("Intel-IOMMU: assuming all RMRRs are relaxable. This can lead to instability or data loss\\n");\n\ \ \ intel_relaxable_rmrr = 1;' ${KERNEL_SRC_SUBMODULE}/drivers/iommu/intel/iommu.c 4 | --------------------------------------------------------------------------------