├── .gitignore ├── patches ├── proxmox.patch ├── proxmox7.patch ├── add-relaxable-rmrr-5_8_and_up.patch ├── add-relaxable-rmrr-5_11.patch ├── add-relaxable-rmrr-below-5_8.patch ├── add-relaxable-rmrr-5_13.patch └── add-relaxable-rmrr-5_15.patch ├── README.md └── deep-dive.md /.gitignore: -------------------------------------------------------------------------------- 1 | proxmox-kernel/ 2 | -------------------------------------------------------------------------------- /patches/proxmox.patch: -------------------------------------------------------------------------------- 1 | --- a/Makefile 2 | +++ b/Makefile 3 | @@ -11,7 +11,7 @@ 4 | KERNEL_MAJMIN=$(KERNEL_MAJ).$(KERNEL_MIN) 5 | KERNEL_VER=$(KERNEL_MAJMIN).$(KERNEL_PATCHLEVEL) 6 | 7 | -EXTRAVERSION=-${KREL}-pve 8 | +EXTRAVERSION=-${KREL}-pve-relaxablermrr 9 | KVNAME=${KERNEL_VER}${EXTRAVERSION} 10 | PACKAGE=pve-kernel-${KVNAME} 11 | HDRPACKAGE=pve-headers-${KVNAME} 12 | --- a/debian/scripts/find-firmware.pl 13 | +++ b/debian/scripts/find-firmware.pl 14 | @@ -8,7 +8,7 @@ 15 | 16 | die "no such directory" if ! -d $dir; 17 | 18 | -die "strange directory name: $dir" if $dir !~ m|^(.*/)?(5.\d.\d+\-\d+\-pve)(/+)?$|; 19 | +#die "strange directory name: $dir" if $dir !~ m|^(.*/)?(5.\d.\d+\-\d+\-pve)(/+)?$|; 20 | 21 | my $apiver = $2; 22 | -------------------------------------------------------------------------------- /patches/proxmox7.patch: -------------------------------------------------------------------------------- 1 | --- a/Makefile 2 | +++ b/Makefile 3 | @@ -11,7 +11,7 @@ 4 | KERNEL_MAJMIN=$(KERNEL_MAJ).$(KERNEL_MIN) 5 | KERNEL_VER=$(KERNEL_MAJMIN).$(KERNEL_PATCHLEVEL) 6 | 7 | -EXTRAVERSION=-${KREL}-pve 8 | +EXTRAVERSION=-${KREL}-pve-relaxablermrr 9 | KVNAME=${KERNEL_VER}${EXTRAVERSION} 10 | PACKAGE=pve-kernel-${KVNAME} 11 | HDRPACKAGE=pve-headers-${KVNAME} 12 | --- a/debian/scripts/find-firmware.pl 13 | +++ b/debian/scripts/find-firmware.pl 14 | @@ -8,7 +8,7 @@ 15 | 16 | die "no such directory" if ! 
-d $dir; 17 | 18 | -die "strange directory name: $dir" if $dir !~ m|^(.*/)?(\d+.\d+.\d+\-\d+\-pve)(/+)?$|; 19 | +#die "strange directory name: $dir" if $dir !~ m|^(.*/)?(\d+.\d+.\d+\-\d+\-pve)(/+)?$|; 20 | 21 | 22 | my $apiver = $2; 23 | -------------------------------------------------------------------------------- /patches/add-relaxable-rmrr-5_8_and_up.patch: -------------------------------------------------------------------------------- 1 | --- a/drivers/iommu/intel/iommu.c 2 | +++ b/drivers/iommu/intel/iommu.c 3 | @@ -356,6 +356,7 @@ 4 | static int intel_iommu_superpage = 1; 5 | static int iommu_identity_mapping; 6 | static int intel_no_bounce; 7 | +static int intel_relaxable_rmrr = 0; 8 | static int iommu_skip_te_disable; 9 | 10 | #define IDENTMAP_GFX 2 11 | @@ -463,6 +464,9 @@ 12 | } else if (!strncmp(str, "nobounce", 8)) { 13 | pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n"); 14 | intel_no_bounce = 1; 15 | + } else if (!strncmp(str, "relax_rmrr", 10)) { 16 | + pr_info("Intel-IOMMU: assuming all RMRRs are relaxable. 
This can lead to instability or data loss\n"); 17 | + intel_relaxable_rmrr = 1; 18 | } 19 | 20 | str += strcspn(str, ","); 21 | @@ -2863,7 +2867,7 @@ 22 | return false; 23 | 24 | pdev = to_pci_dev(dev); 25 | - if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev)) 26 | + if (intel_relaxable_rmrr || IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev)) 27 | return true; 28 | else 29 | return false; 30 | 31 | -------------------------------------------------------------------------------- /patches/add-relaxable-rmrr-5_11.patch: -------------------------------------------------------------------------------- 1 | --- a/drivers/iommu/intel/iommu.c 2021-07-30 16:21:22.235520365 +0100 2 | +++ b/drivers/iommu/intel/iommu.c 2021-07-30 16:28:28.905719413 +0100 3 | @@ -355,6 +355,7 @@ 4 | static int intel_iommu_superpage = 1; 5 | static int iommu_identity_mapping; 6 | static int iommu_skip_te_disable; 7 | +static int intel_relaxable_rmrr = 0; 8 | 9 | #define IDENTMAP_GFX 2 10 | #define IDENTMAP_AZALIA 4 11 | @@ -455,7 +456,10 @@ 12 | } else if (!strncmp(str, "tboot_noforce", 13)) { 13 | pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n"); 14 | intel_iommu_tboot_noforce = 1; 15 | - } 16 | + } else if (!strncmp(str, "relax_rmrr", 10)) { 17 | + pr_info("Intel-IOMMU: assuming all RMRRs are relaxable. 
This can lead to instability or data loss\n"); 18 | + intel_relaxable_rmrr = 1; 19 | + } 20 | 21 | str += strcspn(str, ","); 22 | while (*str == ',') 23 | @@ -2802,7 +2806,7 @@ 24 | return false; 25 | 26 | pdev = to_pci_dev(dev); 27 | - if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev)) 28 | + if (intel_relaxable_rmrr || IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev)) 29 | return true; 30 | else 31 | return false; 32 | -------------------------------------------------------------------------------- /patches/add-relaxable-rmrr-below-5_8.patch: -------------------------------------------------------------------------------- 1 | --- a/drivers/iommu/intel-iommu.c 2 | +++ b/drivers/iommu/intel-iommu.c 3 | @@ -367,6 +367,7 @@ static int intel_iommu_strict; 4 | static int intel_iommu_superpage = 1; 5 | static int iommu_identity_mapping; 6 | static int intel_no_bounce; 7 | +static int intel_relaxable_rmrr = 0; 8 | 9 | #define IDENTMAP_ALL 1 10 | #define IDENTMAP_GFX 2 11 | @@ -468,6 +469,9 @@ static int __init intel_iommu_setup(char *str) 12 | } else if (!strncmp(str, "nobounce", 8)) { 13 | pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n"); 14 | intel_no_bounce = 1; 15 | + } else if (!strncmp(str, "relax_rmrr", 10)) { 16 | + pr_info("Intel-IOMMU: assuming all RMRRs are relaxable. 
This can lead to instability or data loss\n"); 17 | + intel_relaxable_rmrr = 1; 18 | } 19 | 20 | str += strcspn(str, ","); 21 | @@ -2866,7 +2870,7 @@ static bool device_rmrr_is_relaxable(struct device *dev) 22 | return false; 23 | 24 | pdev = to_pci_dev(dev); 25 | - if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev)) 26 | + if (intel_relaxable_rmrr || IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev)) 27 | return true; 28 | else 29 | return false; 30 | -------------------------------------------------------------------------------- /patches/add-relaxable-rmrr-5_13.patch: -------------------------------------------------------------------------------- 1 | --- a/drivers/iommu/intel/iommu.c 2022-02-26 13:51:33.821885509 +0100 2 | +++ b/drivers/iommu/intel/iommu.c 2022-02-26 13:58:27.231463792 +0100 3 | @@ -364,6 +364,7 @@ 4 | static int intel_iommu_superpage = 1; 5 | static int iommu_identity_mapping; 6 | static int iommu_skip_te_disable; 7 | +static int intel_relaxable_rmrr = 0; 8 | 9 | #define IDENTMAP_GFX 2 10 | #define IDENTMAP_AZALIA 4 11 | @@ -465,6 +466,9 @@ 12 | } else if (!strncmp(str, "tboot_noforce", 13)) { 13 | pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n"); 14 | intel_iommu_tboot_noforce = 1; 15 | + } else if (!strncmp(str, "relax_rmrr", 10)) { 16 | + pr_info("Intel-IOMMU: assuming all RMRRs are relaxable. 
This can lead to instability or data loss\n"); 17 | + intel_relaxable_rmrr = 1; 18 | } else { 19 | pr_notice("Unknown option - '%s'\n", str); 20 | } 21 | @@ -2846,7 +2850,7 @@ 22 | return false; 23 | 24 | pdev = to_pci_dev(dev); 25 | - if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev)) 26 | + if (intel_relaxable_rmrr || IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev)) 27 | return true; 28 | else 29 | return false; 30 | -------------------------------------------------------------------------------- /patches/add-relaxable-rmrr-5_15.patch: -------------------------------------------------------------------------------- 1 | --- a/drivers/iommu/intel/iommu.c 2022-02-27 12:02:53.958814198 +0100 2 | +++ b/drivers/iommu/intel/iommu.c 2022-02-27 12:03:07.402842983 +0100 3 | @@ -338,6 +338,7 @@ 4 | static int intel_iommu_superpage = 1; 5 | static int iommu_identity_mapping; 6 | static int iommu_skip_te_disable; 7 | +static int intel_relaxable_rmrr = 0; 8 | 9 | #define IDENTMAP_GFX 2 10 | #define IDENTMAP_AZALIA 4 11 | @@ -442,6 +443,9 @@ 12 | } else if (!strncmp(str, "tboot_noforce", 13)) { 13 | pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n"); 14 | intel_iommu_tboot_noforce = 1; 15 | + } else if (!strncmp(str, "relax_rmrr", 10)) { 16 | + pr_info("Intel-IOMMU: assuming all RMRRs are relaxable. 
This can lead to instability or data loss\n"); 17 | + intel_relaxable_rmrr = 1; 18 | } else { 19 | pr_notice("Unknown option - '%s'\n", str); 20 | } 21 | @@ -2824,7 +2828,7 @@ 22 | return false; 23 | 24 | pdev = to_pci_dev(dev); 25 | - if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev)) 26 | + if (intel_relaxable_rmrr || IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev)) 27 | return true; 28 | else 29 | return false; 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 🍻 Relaxed RMRR Mapping for Linux 3.17+ 2 | ## 🐧💨 Now you can use PCI passthrough on broken platforms 3 | 4 | ### TL;DR 5 | When you try to use PCI/PCIe passthrough in KVM/QEMU/Proxmox you get: 6 | ``` 7 | vfio-pci 0000:01:00.1: Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor. 8 | ``` 9 | followed by `vfio: failed to set iommu for container: Operation not permitted`. 10 | 11 | This kernel patch fixes the problem **on kernels v3.17 and up** (tested up to 5.9.1). You can skip to "[Installation](README.md#installation)" 12 | section if you don't care about the rest. Reading of "[Disclaimers](README.md#disclaimers)" section to understand the 13 | risks, and "[Solutions & hacks](deep-dive.md#other-solutions--hacks)" to get the idea of different alternatives is 14 | highly recommended. 15 | 16 | --- 17 | 18 | ### Table of Contents 19 | 1. [Installation](README.md#installation) 20 | - [Proxmox - premade packages](README.md#proxmox---premade-packages-easy) 21 | - [Proxmox - building from sources](README.md#proxmox---building-from-sources-advanced) 22 | - [Other distros](README.md#other-distros) 23 | 2. [Configuration](README.md#configuration) 24 | 3. 
[Deep Dive](deep-dive.md) - *thorough research on the problem written for mortals* 25 | - [Technical details](deep-dive.md#technical-details) 26 | - [How virtual machines use memory?](deep-dive.md#how-virtual-machines-use-memory) 27 | - [Why do we need VT-d / AMD-Vi?](deep-dive.md#why-do-we-need-vt-d--amd-vi) 28 | - [How PCI/PCIe actually work?](deep-dive.md#how-pcipcie-actually-work) 29 | - [RMRR - the monster in a closet](deep-dive.md#rmrr---the-monster-in-a-closet) 30 | - [What vendors did wrong?](deep-dive.md#what-vendors-did-wrong) 31 | - [Other solutions & hacks](deep-dive.md#other-solutions--hacks) 32 | - [Contact your platform vendor](deep-dive.md#contact-your-platform-vendor) 33 | - [Use OS which ignores RMRRs](deep-dive.md#use-os-which-ignores-rmrrs) 34 | - [Attempt HPE's pseudofix (if you use HP)](deep-dive.md#attempt-hpes-pseudofix-if-you-use-hp) 35 | - [The comment-the-error-out hack (v3.17 - 5.3)](deep-dive.md#the-comment-the-error-out-hack-v317---53) 36 | - [Long-term solution - utilizing relaxable reservation regions (>=3.17)](deep-dive.md#long-term-solution---utilizing-relaxable-reservation-regions-317) 37 | - [Why commenting-out the error is a bad idea](deep-dive.md#why-commenting-out-the-error-is-a-bad-idea) 38 | - [The kernel moves on quickly](deep-dive.md#the-kernel-moves-on-quickly) 39 | - [What this patch actually does](deep-dive.md#what-this-patch-actually-does) 40 | - [Why kernel patch and not a loadable module?](deep-dive.md#why-kernel-patch-and-not-a-loadable-module) 41 | - [The future](deep-dive.md#the-future) 42 | 4. [Disclaimers](README.md#disclaimers) 43 | 5. [Acknowledgments & References](README.md#acknowledgments--references) 44 | 6. [License](README.md#license) 45 | 46 | --- 47 | 48 | ### Installation 49 | 50 | #### Proxmox - premade packages (easy) 51 | As I believe in *[eating your own dog food](https://en.wikipedia.org/wiki/Eating_your_own_dog_food)* I run the kernel 52 | described here. Thus, I publish precompiled packages. 
53 | 54 | 1. Go to the [releases tab](https://github.com/kiler129/relax-intel-rmrr/releases/) and pick appropriate packages 55 | 2. Download all `*.deb` packages to the server (you can copy links and use `wget https://...` on the server itself) 56 | 3. Install all using `dpkg -i *.deb` in the folder where you downloaded the debs 57 | 4. *(OPTIONAL)* Verify the kernel works with the patch disabled by rebooting and checking if `uname -r` shows a version 58 | ending with `-pve-relaxablermrr` 59 | 5. [Configure the kernel](README.md#configuration) 60 | 61 | --- 62 | 63 | #### Proxmox - building from sources (advanced) 64 | If you're running a version of Proxmox with [no packages available](README.md#proxmox---premade-packages-easy) you can 65 | [compile the kernel yourself using patches provided](build/proxmox/). 66 | 67 | --- 68 | 69 | #### Other distros 70 | 1. Download kernel sources appropriate for your distribution 71 | 2. Apply an appropriate patch to the source tree 72 | - Go to the folder with your kernel source 73 | - For Linux 3.17 - 5.7: `patch -p1 < ../patches/add-relaxable-rmrr-below-5_8.patch` 74 | - For Linux >=5.8: `patch -p1 < ../patches/add-relaxable-rmrr-5_8_and_up.patch` 75 | 3. Follow your distro kernel compilation & installation instructions: 76 | - [Debian](https://wiki.debian.org/BuildADebianKernelPackage) 77 | - [Ubuntu](https://wiki.ubuntu.com/Kernel/BuildYourOwnKernel) 78 | 79 | ***TODO:*** *Add automation script* 80 | 81 | --- 82 | 83 | ### Configuration 84 | By default, after the kernel is installed, the patch will be *inactive* (i.e. the kernel will behave like this patch was 85 | never applied). To activate it you have to add `intel_iommu=relax_rmrr` to your Linux boot args. 86 | 87 | In most distros (including Proxmox) you do this by: 88 | 1. Opening `/etc/default/grub` (e.g. using `nano /etc/default/grub`) 89 | 2. 
Editing the `GRUB_CMDLINE_LINUX_DEFAULT` to include the option: 90 | - Example of old line: 91 | ``` 92 | GRUB_CMDLINE_LINUX_DEFAULT="quiet intel_iommu=on iommu=pt intremap=no_x2apic_optout" 93 | ``` 94 | - Example of new line: 95 | ``` 96 | GRUB_CMDLINE_LINUX_DEFAULT="quiet intel_iommu=on,relax_rmrr iommu=pt intremap=no_x2apic_optout" 97 | ``` 98 | - *Side note: these are actually options which will make your PCI passthrough work and do so efficiently* 99 | 3. Running `update-grub` 100 | 4. Rebooting 101 | 102 | To verify if the patch is active, execute `dmesg | grep 'Intel-IOMMU'` after reboot. You should see a result similar 103 | to this: 104 | 105 | ``` 106 | root@sandbox:~# dmesg | grep 'Intel-IOMMU' 107 | [ 0.050195] DMAR: Intel-IOMMU: assuming all RMRRs are relaxable. This can lead to instability or data loss 108 | root@sandbox:~# 109 | ``` 110 | 111 | --- 112 | 113 | ### Disclaimers 114 | - I'm not a kernel programmer by any means, so if I got something horribly wrong correct me please :) 115 | - This patch should be safe, as long as you don't try to remap devices which are used by the IPMI/BIOS, e.g. 116 | - Network port shared between your IPMI and OS 117 | - RAID card in non-HBA mode with its driver loaded on the host 118 | - Network card with monitoring system installed on the host (e.g. [Intel Active Health System Agent](https://support.hpe.com/hpesc/public/docDisplay?docId=emr_na-c04781229)) 119 | - This is not a supported solution by any of the vendors. In fact this is a direct violation of Intel's VT-d specs 120 | (which Linux already violates anyway, but this is increasing the scope). It may cause crashes or major instabilities. 121 | You've been warned. 
122 | 123 | --- 124 | 125 | ### Acknowledgments & References 126 | - [Comment-out hack research by dschense](https://forum.proxmox.com/threads/hp-proliant-microserver-gen8-raidcontroller-hp-p410-passthrough-probleme.30547/post-155675) 127 | - [Proxmox kernel compilation & patching by Feni](https://forum.proxmox.com/threads/compile-proxmox-ve-with-patched-intel-iommu-driver-to-remove-rmrr-check.36374/) 128 | - [Linux IOMMU Support](https://www.kernel.org/doc/html/latest/x86/intel-iommu.html) 129 | - [RedHat RMRR EXCLUSION Whitepaper](https://access.redhat.com/sites/default/files/attachments/rmrr-wp1.pdf) 130 | - [Intel® Virtualization Technology for Directed I/O (VT-d)](https://software.intel.com/content/www/us/en/develop/articles/intel-virtualization-technology-for-directed-io-vt-d-enhancing-intel-platforms-for-efficient-virtualization-of-io-devices.html) 131 | - [Intel® Virtualization Technology for Directed I/O Architecture Specification](https://software.intel.com/content/www/us/en/develop/download/intel-virtualization-technology-for-directed-io-architecture-specification.html) 132 | 133 | --- 134 | 135 | ### License 136 | This work (patches & docs) is dual-licensed under MIT and GPL 2.0 (or any later version), which should be treated as an 137 | equivalent of Linux `Dual MIT/GPL` (i.e. pick a license you prefer). 138 | -------------------------------------------------------------------------------- /deep-dive.md: -------------------------------------------------------------------------------- 1 | ### Deep Dive into the problem 2 | 3 | ### Table of Contents 4 | 1. [Installation](README.md#installation) 5 | - [Proxmox - premade packages](README.md#proxmox---premade-packages) 6 | - [Proxmox - building from sources](README.md#proxmox---building-from-sources) 7 | - [Other distros](README.md#other-distros) 8 | 2. [Configuration](README.md#configuration) 9 | 3. 
**Deep Dive** <= you're here 10 | - [Technical details](deep-dive.md#technical-details) 11 | - [How virtual machines use memory?](deep-dive.md#how-virtual-machines-use-memory) 12 | - [Why do we need VT-d / AMD-Vi?](deep-dive.md#why-do-we-need-vt-d--amd-vi) 13 | - [How PCI/PCIe actually work?](deep-dive.md#how-pcipcie-actually-work) 14 | - [RMRR - the monster in a closet](deep-dive.md#rmrr---the-monster-in-a-closet) 15 | - [What vendors did wrong?](deep-dive.md#what-vendors-did-wrong) 16 | - [Other solutions & hacks](deep-dive.md#other-solutions--hacks) 17 | - [Contact your platform vendor](deep-dive.md#contact-your-platform-vendor) 18 | - [Use OS which ignores RMRRs](deep-dive.md#use-os-which-ignores-rmrrs) 19 | - [Attempt HPE's pseudofix (if you use HP)](deep-dive.md#attempt-hpes-pseudofix-if-you-use-hp) 20 | - [The comment-the-error-out hack (v3.17 - 5.3)](deep-dive.md#the-comment-the-error-out-hack-v317---53) 21 | - [Long-term solution - utilizing relaxable reservation regions (>=3.17)](deep-dive.md#long-term-solution---utilizing-relaxable-reservation-regions-317) 22 | - [Why commenting-out the error is a bad idea](deep-dive.md#why-commenting-out-the-error-is-a-bad-idea) 23 | - [The kernel moves on quickly](deep-dive.md#the-kernel-moves-on-quickly) 24 | - [What this patch actually does](deep-dive.md#what-this-patch-actually-does) 25 | - [Why kernel patch and not a loadable module?](deep-dive.md#why-kernel-patch-and-not-a-loadable-module) 26 | - [The future](deep-dive.md#the-future) 27 | 4. [Disclaimers](README.md#disclaimers) 28 | 5. [Acknowledgments & References](README.md#acknowledgments--references) 29 | 6. [License](README.md#license) 30 | 31 | --- 32 | 33 | ### Technical details 34 | 35 | #### How virtual machines use memory? 36 | To understand PCI passthrough we first need to understand how VMs work. Each VM launched in the system gets a new 37 | virtual address space and has no direct access to the host memory. 
Yet, the guest OS runs like it was running with a 38 | real RAM, using any memory addresses it wants. In other words the guest OS has no idea (in terms of memory) that it is 39 | being virtualized. Logically there has to be some map to translate guest OS requests to the real memory addresses, since 40 | multiple guest OSes have to share the same physical host memory. The hypervisor (host OS) is responsible for maintaining 41 | a map between GPA (Guest Physical Address) and HPA (Host Physical Address). To better understand this look at the (VERY 42 | simplified) graphics: 43 | 44 | ``` 45 | +--------------------------------HOST----------------------------------------+ 46 | | | 47 | | +--------------------------HOST MEMORY-------------------------------+ | 48 | | | +-------+ +----------GUEST MEMORY-----------+ | | 49 | | | | vim | |---------------------------------| | | 50 | | | | mem | |---------------------------------| | | 51 | | | +-------+ +---------------------------------+ | | 52 | | | 0xA000 0xA100 | | 53 | | +--------------------------------------------------------------------+ | 54 | | 0x0000 0xF000 0xF0FF 0x....| 55 | | | 56 | | +--------+ +----------------GUEST VM------------------+ | 57 | | | | | +------------GUEST MEMORY--------------+ | | 58 | | | vim | | | | | | | | 59 | | | | | | guest kernel| wget | | | | 60 | | +--------+ | | | mem | | | | 61 | | | +-------------+--------+---------------+ | | 62 | | | 0x00 0x1E 0x20 0xFF | | 63 | | | +------+ | | 64 | | | | wget | | | 65 | | | +------+ | | 66 | | +------------------------------------------+ | 67 | +----------------------------------------------------------------------------+ 68 | 69 | (addresses don't represent real x86 space[!] and are not drawn to scale) 70 | ``` 71 | 72 | When a VM is run the hypervisor gives it a predetermined amount of memory and tells the guest OS that it has a contiguous 73 | space of 255 bytes. 
The guest OS knows it can use 255 bytes from 0x00 and doesn't care/know where this memory physically 74 | resides. Host OS now needs to find space for 255 bytes, either in one or multiple chunks in the physical memory. It can 75 | map it as on the diagram to one big chunk or split it into multiple ones, as long as it can map guest request for its 76 | `0x1E`-`0x20` to e.g. `0xF010`-`0xF012` and return the data. 77 | 78 | --- 79 | 80 | #### Why do we need VT-d / AMD-Vi? 81 | While mapping the memory (as described in the previous section) the host OS must take care of three things: 82 | 1. When guest OS requests a page from memory using its (GPA) address it will get it from the HPA-addressed memory (=mapping) 83 | 2. Memory of the guest cannot be touched by anything other than the guest (=protection) 84 | 3. The process needs to be fast 85 | 86 | While the first two are achievable with pure software emulation, it makes the memory access process slow as molasses 87 | since it can no longer rely on [DMA](https://en.wikipedia.org/wiki/Direct_memory_access) but must involve the CPU for every 88 | shift of bytes back and forth. 89 | Both VT-d and AMD-Vi essentially allow instructing the hardware to do the mapping and enforce domains (security 90 | boundaries). In such case host OS simply needs to inform the hardware about the address to be translated on-the-fly. 91 | 92 | More on that can be found in the [Intel VT-d docs](https://software.intel.com/content/www/us/en/develop/articles/intel-virtualization-technology-for-directed-io-vt-d-enhancing-intel-platforms-for-efficient-virtualization-of-io-devices.html). 93 | 94 | --- 95 | 96 | #### How PCI/PCIe actually work? 97 | Most people blindly plop `intel_iommu=on` and `iommu=pt` into their kernel line and get surprised when things don't 98 | work. I did too, so I started digging, which resulted in this whole repository. 99 | 100 | Every device in the system has some reserved memory address space. 
It's used by the device and the host 101 | system to communicate and exchange data. That reserved memory address is dictated by the firmware (i.e. BIOS) as both 102 | the device and OS must know it to communicate. In essence this is just slightly different than normal memory mapping. 103 | Here, you don't have just some OS using the memory but an OS **and** a device using the memory. 104 | 105 | Here's where [IOMMU](https://en.wikipedia.org/wiki/Input–output_memory_management_unit) comes into play. In essence it's 106 | able to remap GPA to HPA for both the OS and the device so that they can talk to each other. When device memory is 107 | remapped the guest OS talks to the hardware like it was really under some physical address it expects, while in reality 108 | the [IOMMU](https://en.wikipedia.org/wiki/Input–output_memory_management_unit) moves the reserved region aperture 109 | somewhere else in the address space. This is *usually* fine. 110 | 111 | --- 112 | 113 | #### RMRR - the monster in a closet 114 | While both AMD and Intel allow for [IOMMU](https://en.wikipedia.org/wiki/Input–output_memory_management_unit) remapping 115 | device's memory, Intel had an idea to introduce RMRR (Reserved Memory Region Reporting). In essence the firmware/BIOS 116 | publishes a list of regions where usage of [IOMMU](https://en.wikipedia.org/wiki/Input–output_memory_management_unit) is 117 | ostensibly prohibited. The original intent for that feature was good, by allowing for USB keyboards to be automagically 118 | emulated by the USB controller itself before USB driver is loaded, like they were connected via PS/2. This also allows 119 | the GPU to display the picture before OS is loaded and even before [IOMMU](https://en.wikipedia.org/wiki/Input–output_memory_management_unit) 120 | is initialized. 
121 | However, it required some sacrifices: that memory should not be remapped as only OS and the device use the [IOMMU](https://en.wikipedia.org/wiki/Input–output_memory_management_unit) 122 | and devices on the motherboard which may be communicating with e.g. the GPU pre-boot don't know anything about the 123 | mapping. 124 | 125 | However, one *undocumented assumption* was made: as soon as the driver is loaded the "out-of-band" access to the device 126 | ends and the OS takes over. However, *technically* the VT-d specification says that the RMRR is valid indefinitely. 127 | 128 | Linux for a long time (up until [v3.17rc1](https://github.com/torvalds/linux/commit/c875d2c1b8083cd627ea0463e20bf22c2d7421ee)) 129 | didn't respect RMRR while setting up [IOMMU](https://en.wikipedia.org/wiki/Input–output_memory_management_unit) 130 | respecting that against-the-specs but ubiquitous assumption. This was an oversight as [IOMMU](https://en.wikipedia.org/wiki/Input–output_memory_management_unit) 131 | API assumes exclusive control over the remapped address space. If such space is remapped the DMA access from outside of 132 | the [IOMMU](https://en.wikipedia.org/wiki/Input–output_memory_management_unit) domain (i.e. from something else than the 133 | host or VM guest OS, like a device on the motherboard) will fail which may lead to unpredictable results if the hardware 134 | vendor didn't follow the *undocumented assumption*. 
135 | 136 | 137 | Linux, as of now, excludes two specific classes of devices from being constrained by RMRR: 138 | - USB devices (as we historically trust they don't do weird things) 139 | - GPUs (unspoken rule that they're accessed out-of-band only before the driver loads) 140 | 141 | 142 | RMRR *by itself* isn't evil, as long as it's used as [Intel's VT-d specification](https://software.intel.com/content/www/us/en/develop/download/intel-virtualization-technology-for-directed-io-architecture-specification.html) 143 | intended - "*[RMRRs] that are either not DMA targets, or memory ranges that may be target of BIOS 144 | initiated DMA only during pre-boot phase (such as from a boot disk drive) **must not** be included in the reserved 145 | memory region reporting.*". 146 | 147 | 148 | Intel anticipated that some will be tempted to misuse the feature as they warned in the VT-d specification: "*RMRR 149 | regions are expected to be used for legacy usages (...). Platform designers should avoid or limit use of reserved memory 150 | regions*". 151 | 152 | ---- 153 | 154 | #### What vendors did wrong? 155 | HP (and probably others) decided to mark **every freaking PCI device memory space as RMRR!**`*` Like that, 156 | just in case... just so that their tools could potentially maybe monitor these devices while OS agent is not installed. But 157 | wait, there's more! They marked **ALL** devices as such, even third party ones physically installed in motherboard's 158 | PCI/PCIe slots! 159 | 160 | This in turn killed PCI passthrough for any of the devices in systems running Linux [>=3.17rc1](https://github.com/torvalds/linux/commit/c875d2c1b8083cd627ea0463e20bf22c2d7421ee). 
161 | 162 | *`*` In case you skipped other sections above, RMRR is a special part of the memory which cannot be moved 163 | to a VM.* 164 | 165 | --- 166 | 167 | ### Other solutions & hacks 168 | 169 | #### Contact your platform vendor 170 | As the error suggests you can try to convince your vendor to fix the BIOS. If you do please create an issue in this repo 171 | to tell me about it, as this is **the only** real solution to the problem. 172 | 173 | --- 174 | 175 | #### Use OS which ignores RMRRs 176 | Some operating systems, notably [VMWare ESXi and vSphere](https://www.vmware.com/products/esxi-and-esx.html), are 177 | believed to ignore RMRRs (cannot be verified as they're closed-source). They're able to passthrough the devices without 178 | a problem, as long as you don't do something deliberately dangerous (see [Disclaimers](README.md#disclaimers)). 179 | 180 | --- 181 | 182 | #### Attempt HPE's pseudofix (if you use HP) 183 | To HPE's credit, they [recognized the problem and released an advisory with mitigations](https://support.hpe.com/hpesc/public/docDisplay?docId=emr_na-c04781229). 184 | In short, HPE's solution is threefold: 185 | 1. Fix the firmware to not include GPUs in RMRR 186 | 2. Use System Configuration utility on Gen9+ servers to disable "HP Shared Memory features" on selected HP cards 187 | 3. Use their CLI BIOS/RBSU reconfiguration utility to set special (invisible in menus) flags opting-out PCIe slots 188 | from "smart monitoring" 189 | 190 | However, we wouldn't be here if it actually worked as expected: 191 | - Fix 1 works only on GPUs and affects Linux 3.17-5.4 (as kernel has GPU exclusion since 5.4) 192 | - Fix 2 only works on *some* **external** HPE ethernet adapters with Gen9 and newer servers 193 | - Fix 3 theoretically works on all NICs, but not other cards (e.g. 
HBAs) and [doesn't actually work](https://community.hpe.com/t5/proliant-servers-netservers/microserver-gen8-quot-device-is-ineligible-for-iommu-domain/td-p/6947461#.X5D7SS9h1TY) 194 | (sic!) on some servers which are listed as affected (e.g. widely popular [HP/HPE Microserver Gen8](https://support.hpe.com/hpesc/public/docDisplay?docId=emr_na-c03793258)) 195 | 196 | Some tried [opening a support case](https://community.hpe.com/t5/proliant-servers-netservers/re-device-is-ineligible-for-iommu-domain-attach-due-to-platform/m-p/6817728/highlight/true#M21006) 197 | but the topic dried out. I tried [nagging HPE to fix the BIOS](https://community.hpe.com/t5/proliant-servers-ml-dl-sl/disabling-rmrds-rmrr-hp-shared-memory-features-on-microserver/td-p/7105623#.X5C0oy9h2uV). 198 | Maybe there's a chance? Who knows... the future will show. 199 | 200 | --- 201 | 202 | #### The comment-the-error-out hack (v3.17 - 5.3) 203 | I was able to track the first mentions of this method to [a post by dschense on a German Proxmox forum](https://forum.proxmox.com/threads/hp-proliant-microserver-gen8-raidcontroller-hp-p410-passthrough-probleme.30547/post-155675) 204 | ([en version](https://translate.googleusercontent.com/translate_c?depth=2&pto=aue&rurl=translate.google.com&sl=de&tl=en&u=https://forum.proxmox.com/threads/hp-proliant-microserver-gen8-raidcontroller-hp-p410-passthrough-probleme.30547/post-155675)). 205 | 206 | In essence this was a logical conclusion: if you have an error comment it out and see what happens. It worked on the 207 | original protection being introduced in Linux v3.17. Unfortunately, the Linux v5.3 changed a lot (see [next section](deep-dive.md#long-term-solution---utilizing-relaxable-reservation-regions-317)). 
208 | 209 | --- 210 | 211 | #### Long-term solution - utilizing relaxable reservation regions (>=3.17) 212 | 213 | ##### Why commenting-out the error is a bad idea 214 | Before Linux v5.3, RMRR protection relied on [a simple patch introduced in v3.17](https://github.com/torvalds/linux/commit/c875d2c1b8083cd627ea0463e20bf22c2d7421ee) 215 | which excluded USB devices. [Commenting out the error](#the-comment-the-error-out-hack-v317---53) was a working 216 | solution, as the kernel (including KVM subsystem) didn't care about the reserved regions. 217 | 218 | The situation changed dramatically. A large change, aiming to [introduce IOVA list management](https://patchwork.kernel.org/project/kvm/cover/20190723160637.8384-1-shameerali.kolothum.thodi@huawei.com/) 219 | outside of the [IOMMU](https://en.wikipedia.org/wiki/Input–output_memory_management_unit) driver, was introduced. At about 220 | the same time, the RMRR reserved memory [was split into two logical buckets](https://github.com/torvalds/linux/commit/1c5c59fbad20a63954de07687e4a29af18d1be12): 221 | absolutely-reserved (`IOMMU_RESV_DIRECT`) and so-called relaxed (`IOMMU_RESV_DIRECT_RELAXABLE`). USB devices and now 222 | GPUs were marked as *"relaxable"* as they were deemed safe to be remapped (even if against the VT-d specs and 223 | firmware's will). 224 | 225 | 226 | ##### The kernel moves on quickly 227 | Other subsystems naturally [started utilizing](https://github.com/torvalds/linux/commit/9b77e5c79840fc334a5b7f770c5ab0c09dc0e028) 228 | that new IOVA interface, which broke the *"[comment-the-error-out](#the-comment-the-error-out-hack-v317---53)"* patch. 229 | Now, with the [IOMMU](https://en.wikipedia.org/wiki/Input–output_memory_management_unit) error message commented out, QEMU 230 | [will explode on vfio_dma_map()](https://bugs.launchpad.net/qemu/+bug/1869006/comments/14). 
231 | Understandably, and for good reasons, [developers refuse to accommodate any requests to disable that](https://bugs.launchpad.net/qemu/+bug/1869006/comments/18). 232 | While even more checks can be commented-out and patched, as more subsystems in the kernel start relying on the IOVA 233 | list management, it will be a cat-and-mouse game after every kernel release. 234 | 235 | 236 | ##### What this patch actually does 237 | The patch plugs into the same mechanism as the vanilla kernel used to [mark USB and GPUs as "relaxable"](https://github.com/torvalds/linux/commit/1c5c59fbad20a63954de07687e4a29af18d1be12). 238 | This has three benefits: 239 | - The RMRR is not fully NULLified, as the memory is marked as reserved-with-exceptions and not just not reserved. This, 240 | combined with IOVA list management, ensures that if some code somewhere needs to work differently with relaxable 241 | devices it will work with this patch properly. 242 | - This patch doesn't introduce inconsistent state in the kernel. RMRRs are not hidden from the kernel by removal, nor 243 | ignored just in one place. This patch just changes the designation of these regions from `IOMMU_RESV_DIRECT` (*"we 244 | know it's reserved and we will hold your hand"*) to [`IOMMU_RESV_DIRECT_RELAXABLE`](https://lore.kernel.org/patchwork/patch/1079954/) 245 | (*"we know it's reserved but it's your playground"*). 246 | - It works across all affected kernels (v5.9.1 being the newest at the time of writing) 247 | 248 | Additionally, this mechanism is [controllable with a boot option](README.md#configuration) making it safe and easy to 249 | disable as needed. 250 | 251 | 252 | ##### Why kernel patch and not a loadable module? 253 | Before taking this approach I poked around to see if the [IOMMU driver](https://github.com/torvalds/linux/tree/master/drivers/iommu/intel) 254 | has any API around RMRR. It does not. The driver doesn't export any functions which could make a module feasible. 
255 | While Linux >=5.3 has the IOVA list management interface, it is [being built by the Intel IOMMU driver](https://github.com/torvalds/linux/commit/1c5c59fbad20a63954de07687e4a29af18d1be12). 256 | What it means is the hardcoded relaxable logic [decides about IOVA designation](https://github.com/torvalds/linux/commit/1c5c59fbad20a63954de07687e4a29af18d1be12#diff-e1fff7a2368c04e11696812359f854de9da431c63ec7c5a7bec8f6020e112a2aR2916). 257 | Later on, the same logic is [used for a final sanity check](https://github.com/torvalds/linux/blob/5f9e832c137075045d15cd6899ab0505cfb2ca4b/drivers/iommu/intel-iommu.c#L5057) 258 | independently from the state of the memory saved in the IOVA list. Only after this check passes the IOMMU mapping is 259 | added. 260 | 261 | In other words, even if the >=5.4 [IOVA API is used to modify](https://github.com/torvalds/linux/commit/af029169b8fdae31064624d60b5469a3da95ad32) 262 | the assignment, the actual IOMMU remapping will fail with the *"Device is ineligible for IOMMU domain attach..."* error. 263 | 264 | 265 | #### The future 266 | It would be great if this patch could be upstreamed. However, I see a slim-to-none chance of that happening, as this change 267 | is prone to abuse. Still, I will definitely try to communicate with kernel folks on how to proceed. 268 | --------------------------------------------------------------------------------