├── .gitignore
├── patches
├── proxmox.patch
├── proxmox7.patch
├── add-relaxable-rmrr-5_8_and_up.patch
├── add-relaxable-rmrr-5_11.patch
├── add-relaxable-rmrr-below-5_8.patch
├── add-relaxable-rmrr-5_13.patch
└── add-relaxable-rmrr-5_15.patch
├── README.md
└── deep-dive.md
/.gitignore:
--------------------------------------------------------------------------------
1 | proxmox-kernel/
2 |
--------------------------------------------------------------------------------
/patches/proxmox.patch:
--------------------------------------------------------------------------------
1 | --- a/Makefile
2 | +++ b/Makefile
3 | @@ -11,7 +11,7 @@
4 | KERNEL_MAJMIN=$(KERNEL_MAJ).$(KERNEL_MIN)
5 | KERNEL_VER=$(KERNEL_MAJMIN).$(KERNEL_PATCHLEVEL)
6 |
7 | -EXTRAVERSION=-${KREL}-pve
8 | +EXTRAVERSION=-${KREL}-pve-relaxablermrr
9 | KVNAME=${KERNEL_VER}${EXTRAVERSION}
10 | PACKAGE=pve-kernel-${KVNAME}
11 | HDRPACKAGE=pve-headers-${KVNAME}
12 | --- a/debian/scripts/find-firmware.pl
13 | +++ b/debian/scripts/find-firmware.pl
14 | @@ -8,7 +8,7 @@
15 |
16 | die "no such directory" if ! -d $dir;
17 |
18 | -die "strange directory name: $dir" if $dir !~ m|^(.*/)?(5.\d.\d+\-\d+\-pve)(/+)?$|;
19 | +#die "strange directory name: $dir" if $dir !~ m|^(.*/)?(5.\d.\d+\-\d+\-pve)(/+)?$|;
20 |
21 | my $apiver = $2;
22 |
--------------------------------------------------------------------------------
/patches/proxmox7.patch:
--------------------------------------------------------------------------------
1 | --- a/Makefile
2 | +++ b/Makefile
3 | @@ -11,7 +11,7 @@
4 | KERNEL_MAJMIN=$(KERNEL_MAJ).$(KERNEL_MIN)
5 | KERNEL_VER=$(KERNEL_MAJMIN).$(KERNEL_PATCHLEVEL)
6 |
7 | -EXTRAVERSION=-${KREL}-pve
8 | +EXTRAVERSION=-${KREL}-pve-relaxablermrr
9 | KVNAME=${KERNEL_VER}${EXTRAVERSION}
10 | PACKAGE=pve-kernel-${KVNAME}
11 | HDRPACKAGE=pve-headers-${KVNAME}
12 | --- a/debian/scripts/find-firmware.pl
13 | +++ b/debian/scripts/find-firmware.pl
14 | @@ -8,7 +8,7 @@
15 |
16 | die "no such directory" if ! -d $dir;
17 |
18 | -die "strange directory name: $dir" if $dir !~ m|^(.*/)?(\d+.\d+.\d+\-\d+\-pve)(/+)?$|;
19 | +#die "strange directory name: $dir" if $dir !~ m|^(.*/)?(\d+.\d+.\d+\-\d+\-pve)(/+)?$|;
20 |
21 |
22 | my $apiver = $2;
23 |
--------------------------------------------------------------------------------
/patches/add-relaxable-rmrr-5_8_and_up.patch:
--------------------------------------------------------------------------------
1 | --- a/drivers/iommu/intel/iommu.c
2 | +++ b/drivers/iommu/intel/iommu.c
3 | @@ -356,6 +356,7 @@
4 | static int intel_iommu_superpage = 1;
5 | static int iommu_identity_mapping;
6 | static int intel_no_bounce;
7 | +static int intel_relaxable_rmrr = 0;
8 | static int iommu_skip_te_disable;
9 |
10 | #define IDENTMAP_GFX 2
11 | @@ -463,6 +464,9 @@
12 | } else if (!strncmp(str, "nobounce", 8)) {
13 | pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
14 | intel_no_bounce = 1;
15 | + } else if (!strncmp(str, "relax_rmrr", 10)) {
16 | + pr_info("Intel-IOMMU: assuming all RMRRs are relaxable. This can lead to instability or data loss\n");
17 | + intel_relaxable_rmrr = 1;
18 | }
19 |
20 | str += strcspn(str, ",");
21 | @@ -2863,7 +2867,7 @@
22 | return false;
23 |
24 | pdev = to_pci_dev(dev);
25 | - if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
26 | + if (intel_relaxable_rmrr || IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
27 | return true;
28 | else
29 | return false;
30 |
31 |
--------------------------------------------------------------------------------
/patches/add-relaxable-rmrr-5_11.patch:
--------------------------------------------------------------------------------
1 | --- a/drivers/iommu/intel/iommu.c 2021-07-30 16:21:22.235520365 +0100
2 | +++ b/drivers/iommu/intel/iommu.c 2021-07-30 16:28:28.905719413 +0100
3 | @@ -355,6 +355,7 @@
4 | static int intel_iommu_superpage = 1;
5 | static int iommu_identity_mapping;
6 | static int iommu_skip_te_disable;
7 | +static int intel_relaxable_rmrr = 0;
8 |
9 | #define IDENTMAP_GFX 2
10 | #define IDENTMAP_AZALIA 4
11 | @@ -455,7 +456,10 @@
12 | } else if (!strncmp(str, "tboot_noforce", 13)) {
13 | pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
14 | intel_iommu_tboot_noforce = 1;
15 | - }
16 | + } else if (!strncmp(str, "relax_rmrr", 10)) {
17 | + pr_info("Intel-IOMMU: assuming all RMRRs are relaxable. This can lead to instability or data loss\n");
18 | + intel_relaxable_rmrr = 1;
19 | + }
20 |
21 | str += strcspn(str, ",");
22 | while (*str == ',')
23 | @@ -2802,7 +2806,7 @@
24 | return false;
25 |
26 | pdev = to_pci_dev(dev);
27 | - if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
28 | + if (intel_relaxable_rmrr || IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
29 | return true;
30 | else
31 | return false;
32 |
--------------------------------------------------------------------------------
/patches/add-relaxable-rmrr-below-5_8.patch:
--------------------------------------------------------------------------------
1 | --- a/drivers/iommu/intel-iommu.c
2 | +++ b/drivers/iommu/intel-iommu.c
3 | @@ -367,6 +367,7 @@ static int intel_iommu_strict;
4 | static int intel_iommu_superpage = 1;
5 | static int iommu_identity_mapping;
6 | static int intel_no_bounce;
7 | +static int intel_relaxable_rmrr = 0;
8 |
9 | #define IDENTMAP_ALL 1
10 | #define IDENTMAP_GFX 2
11 | @@ -468,6 +469,9 @@ static int __init intel_iommu_setup(char *str)
12 | } else if (!strncmp(str, "nobounce", 8)) {
13 | pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
14 | intel_no_bounce = 1;
15 | + } else if (!strncmp(str, "relax_rmrr", 10)) {
16 | + pr_info("Intel-IOMMU: assuming all RMRRs are relaxable. This can lead to instability or data loss\n");
17 | + intel_relaxable_rmrr = 1;
18 | }
19 |
20 | str += strcspn(str, ",");
21 | @@ -2866,7 +2870,7 @@ static bool device_rmrr_is_relaxable(struct device *dev)
22 | return false;
23 |
24 | pdev = to_pci_dev(dev);
25 | - if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
26 | + if (intel_relaxable_rmrr || IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
27 | return true;
28 | else
29 | return false;
30 |
--------------------------------------------------------------------------------
/patches/add-relaxable-rmrr-5_13.patch:
--------------------------------------------------------------------------------
1 | --- a/drivers/iommu/intel/iommu.c 2022-02-26 13:51:33.821885509 +0100
2 | +++ b/drivers/iommu/intel/iommu.c 2022-02-26 13:58:27.231463792 +0100
3 | @@ -364,6 +364,7 @@
4 | static int intel_iommu_superpage = 1;
5 | static int iommu_identity_mapping;
6 | static int iommu_skip_te_disable;
7 | +static int intel_relaxable_rmrr = 0;
8 |
9 | #define IDENTMAP_GFX 2
10 | #define IDENTMAP_AZALIA 4
11 | @@ -465,6 +466,9 @@
12 | } else if (!strncmp(str, "tboot_noforce", 13)) {
13 | pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
14 | intel_iommu_tboot_noforce = 1;
15 | + } else if (!strncmp(str, "relax_rmrr", 10)) {
16 | + pr_info("Intel-IOMMU: assuming all RMRRs are relaxable. This can lead to instability or data loss\n");
17 | + intel_relaxable_rmrr = 1;
18 | } else {
19 | pr_notice("Unknown option - '%s'\n", str);
20 | }
21 | @@ -2846,7 +2850,7 @@
22 | return false;
23 |
24 | pdev = to_pci_dev(dev);
25 | - if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
26 | + if (intel_relaxable_rmrr || IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
27 | return true;
28 | else
29 | return false;
30 |
--------------------------------------------------------------------------------
/patches/add-relaxable-rmrr-5_15.patch:
--------------------------------------------------------------------------------
1 | --- a/drivers/iommu/intel/iommu.c 2022-02-27 12:02:53.958814198 +0100
2 | +++ b/drivers/iommu/intel/iommu.c 2022-02-27 12:03:07.402842983 +0100
3 | @@ -338,6 +338,7 @@
4 | static int intel_iommu_superpage = 1;
5 | static int iommu_identity_mapping;
6 | static int iommu_skip_te_disable;
7 | +static int intel_relaxable_rmrr = 0;
8 |
9 | #define IDENTMAP_GFX 2
10 | #define IDENTMAP_AZALIA 4
11 | @@ -442,6 +443,9 @@
12 | } else if (!strncmp(str, "tboot_noforce", 13)) {
13 | pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
14 | intel_iommu_tboot_noforce = 1;
15 | + } else if (!strncmp(str, "relax_rmrr", 10)) {
16 | + pr_info("Intel-IOMMU: assuming all RMRRs are relaxable. This can lead to instability or data loss\n");
17 | + intel_relaxable_rmrr = 1;
18 | } else {
19 | pr_notice("Unknown option - '%s'\n", str);
20 | }
21 | @@ -2824,7 +2828,7 @@
22 | return false;
23 |
24 | pdev = to_pci_dev(dev);
25 | - if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
26 | + if (intel_relaxable_rmrr || IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
27 | return true;
28 | else
29 | return false;
30 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 🍻 Relaxed RMRR Mapping for Linux 3.17+
2 | ## 🐧💨 Now you can use PCI passthrough on broken platforms
3 |
4 | ### TL;DR
5 | When you try to use PCI/PCIe passthrough in KVM/QEMU/Proxmox you get:
6 | ```
7 | vfio-pci 0000:01:00.1: Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.
8 | ```
9 | followed by `vfio: failed to set iommu for container: Operation not permitted`.
10 |
11 | This kernel patch fixes the problem **on kernels v3.17 and up** (tested up to 5.9.1). You can skip to "[Installation](README.md#installation)"
12 | section if you don't care about the rest. Reading of "[Disclaimers](README.md#disclaimers)" section to understand the
13 | risks, and "[Solutions & hacks](deep-dive.md#other-solutions--hacks)" to get the idea of different alternatives is
14 | highly recommended.
15 |
16 | ---
17 |
18 | ### Table of Contents
19 | 1. [Installation](README.md#installation)
20 | - [Proxmox - premade packages](README.md#proxmox---premade-packages-easy)
21 | - [Proxmox - building from sources](README.md#proxmox---building-from-sources-advanced)
22 | - [Other distros](README.md#other-distros)
23 | 2. [Configuration](README.md#configuration)
24 | 3. [Deep Dive](deep-dive.md) - *thorough research on the problem written for mortals*
25 | - [Technical details](deep-dive.md#technical-details)
26 | - [How virtual machines use memory?](deep-dive.md#how-virtual-machines-use-memory)
27 | - [Why do we need VT-d / AMD-Vi?](deep-dive.md#why-do-we-need-vt-d--amd-vi)
28 | - [How PCI/PCIe actually work?](deep-dive.md#how-pcipcie-actually-work)
29 | - [RMRR - the monster in a closet](deep-dive.md#rmrr---the-monster-in-a-closet)
30 | - [What vendors did wrong?](deep-dive.md#what-vendors-did-wrong)
31 | - [Other solutions & hacks](deep-dive.md#other-solutions--hacks)
32 | - [Contact your platform vendor](deep-dive.md#contact-your-platform-vendor)
33 | - [Use OS which ignores RMRRs](deep-dive.md#use-os-which-ignores-rmrrs)
34 | - [Attempt HPE's pseudofix (if you use HP)](deep-dive.md#attempt-hpes-pseudofix-if-you-use-hp)
35 | - [The comment-the-error-out hack (v3.17 - 5.3)](deep-dive.md#the-comment-the-error-out-hack-v317---53)
36 | - [Long-term solution - utilizing relaxable reservation regions (>=3.17)](deep-dive.md#long-term-solution---utilizing-relaxable-reservation-regions-317)
37 | - [Why commenting-out the error is a bad idea](deep-dive.md#why-commenting-out-the-error-is-a-bad-idea)
38 | - [The kernel moves on quickly](deep-dive.md#the-kernel-moves-on-quickly)
39 | - [What this patch actually does](deep-dive.md#what-this-patch-actually-does)
40 | - [Why kernel patch and not a loadable module?](deep-dive.md#why-kernel-patch-and-not-a-loadable-module)
41 | - [The future](deep-dive.md#the-future)
42 | 4. [Disclaimers](README.md#disclaimers)
43 | 5. [Acknowledgments & References](README.md#acknowledgments--references)
44 | 6. [License](README.md#license)
45 |
46 | ---
47 |
48 | ### Installation
49 |
50 | #### Proxmox - premade packages (easy)
51 | As I believe in *[eating your own dog food](https://en.wikipedia.org/wiki/Eating_your_own_dog_food)* I run the kernel
52 | described here. Thus, I publish precompiled packages.
53 |
54 | 1. Go to the [releases tab](https://github.com/kiler129/relax-intel-rmrr/releases/) and pick appropriate packages
55 | 2. Download all `*.deb` packages to the server (you can copy links and use `wget https://...` on the server itself)
56 | 3. Install all using `dpkg -i *.deb` in the folder where you downloaded the debs
57 | 4. *(OPTIONAL)* Verify the kernel works with the patch disabled by rebooting and checking if `uname -r` shows a version
58 | ending with `-pve-relaxablermrr`
59 | 5. [Configure the kernel](README.md#configuration)
60 |
61 | ---
62 |
63 | #### Proxmox - building from sources (advanced)
64 | If you're running a version of Proxmox with [no packages available](README.md#proxmox---premade-packages-easy) you can
65 | [compile the kernel yourself using patches provided](build/proxmox/).
66 |
67 | ---
68 |
69 | #### Other distros
70 | 1. Download kernel sources appropriate for your distribution
71 | 2. Apply an appropriate patch to the source tree
72 | - Go to the folder with your kernel source
73 | - For Linux 3.17 - 5.7: `patch -p1 < ../patches/add-relaxable-rmrr-below-5_8.patch`
74 | - For Linux >=5.8: `patch -p1 < ../patches/add-relaxable-rmrr-5_8_and_up.patch`
75 | 3. Follow your distro kernel compilation & installation instruction:
76 | - [Debian](https://wiki.debian.org/BuildADebianKernelPackage)
77 | - [Ubuntu](https://wiki.ubuntu.com/Kernel/BuildYourOwnKernel)
78 |
79 | ***TODO:*** *Add automation script*
80 |
81 | ---
82 |
83 | ### Configuration
84 | By default, after the kernel is installed, the patch will be *inactive* (i.e. the kernel will behave like this patch was
85 | never applied). To activate it you have to add `intel_iommu=relax_rmrr` to your Linux boot args.
86 |
87 | In most distros (including Proxmox) you do this by:
88 | 1. Opening `/etc/default/grub` (e.g. using `nano /etc/default/grub`)
89 | 2. Editing the `GRUB_CMDLINE_LINUX_DEFAULT` to include the option:
90 | - Example of old line:
91 | ```
92 | GRUB_CMDLINE_LINUX_DEFAULT="quiet intel_iommu=on iommu=pt intremap=no_x2apic_optout"
93 | ```
94 | - Example of new line:
95 | ```
96 | GRUB_CMDLINE_LINUX_DEFAULT="quiet intel_iommu=on,relax_rmrr iommu=pt intremap=no_x2apic_optout"
97 | ```
98 | - *Side note: these are actually options which will make your PCI passthrough work and do so efficiently*
99 | 3. Running `update-grub`
100 | 4. Rebooting
101 |
102 | To verify if the patch is active execute `dmesg | grep 'Intel-IOMMU'` after reboot. You should see a result similar
103 | to this:
104 |
105 | ```
106 | root@sandbox:~# dmesg | grep 'Intel-IOMMU'
107 | [ 0.050195] DMAR: Intel-IOMMU: assuming all RMRRs are relaxable. This can lead to instability or data loss
108 | root@sandbox:~#
109 | ```
110 |
111 | ---
112 |
113 | ### Disclaimers
114 | - I'm not a kernel programmer by any means, so if I got something horribly wrong correct me please :)
115 | - This patch should be safe, as long as you don't try to remap devices which are used by the IPMI/BIOS, e.g.
116 | - Network port shared between your IPMI and OS
117 | - RAID card in non-HBA mode with its driver loaded on the host
118 | - Network card with monitoring system installed on the host (e.g. [Intel Active Health System Agent](https://support.hpe.com/hpesc/public/docDisplay?docId=emr_na-c04781229))
119 | - This is not a supported solution by any of the vendors. In fact this is a direct violation of Intel's VT-d specs
120 | (which Linux already violates anyway, but this is increasing the scope). It may cause crashes or major instabilities.
121 | You've been warned.
122 |
123 | ---
124 |
125 | ### Acknowledgments & References
126 | - [Comment-out hack research by dschense](https://forum.proxmox.com/threads/hp-proliant-microserver-gen8-raidcontroller-hp-p410-passthrough-probleme.30547/post-155675)
127 | - [Proxmox kernel compilation & patching by Feni](https://forum.proxmox.com/threads/compile-proxmox-ve-with-patched-intel-iommu-driver-to-remove-rmrr-check.36374/)
128 | - [Linux IOMMU Support](https://www.kernel.org/doc/html/latest/x86/intel-iommu.html)
129 | - [RedHat RMRR EXCLUSION Whitepaper](https://access.redhat.com/sites/default/files/attachments/rmrr-wp1.pdf)
130 | - [Intel® Virtualization Technology for Directed I/O (VT-d)](https://software.intel.com/content/www/us/en/develop/articles/intel-virtualization-technology-for-directed-io-vt-d-enhancing-intel-platforms-for-efficient-virtualization-of-io-devices.html)
131 | - [Intel® Virtualization Technology for Directed I/O Architecture Specification](https://software.intel.com/content/www/us/en/develop/download/intel-virtualization-technology-for-directed-io-architecture-specification.html)
132 |
133 | ---
134 |
135 | ### License
136 | This work (patches & docs) is dual-licensed under MIT and GPL 2.0 (or any later version), which should be treated as an
137 | equivalent of Linux `Dual MIT/GPL` (i.e. pick a license you prefer).
138 |
--------------------------------------------------------------------------------
/deep-dive.md:
--------------------------------------------------------------------------------
1 | ### Deep Dive into the problem
2 |
3 | ### Table of Contents
4 | 1. [Installation](README.md#installation)
5 | - [Proxmox - premade packages](README.md#proxmox---premade-packages-easy)
6 | - [Proxmox - building from sources](README.md#proxmox---building-from-sources-advanced)
7 | - [Other distros](README.md#other-distros)
8 | 2. [Configuration](README.md#configuration)
9 | 3. **Deep Dive** <= you're here
10 | - [Technical details](deep-dive.md#technical-details)
11 | - [How virtual machines use memory?](deep-dive.md#how-virtual-machines-use-memory)
12 | - [Why do we need VT-d / AMD-Vi?](deep-dive.md#why-do-we-need-vt-d--amd-vi)
13 | - [How PCI/PCIe actually work?](deep-dive.md#how-pcipcie-actually-work)
14 | - [RMRR - the monster in a closet](deep-dive.md#rmrr---the-monster-in-a-closet)
15 | - [What vendors did wrong?](deep-dive.md#what-vendors-did-wrong)
16 | - [Other solutions & hacks](deep-dive.md#other-solutions--hacks)
17 | - [Contact your platform vendor](deep-dive.md#contact-your-platform-vendor)
18 | - [Use OS which ignores RMRRs](deep-dive.md#use-os-which-ignores-rmrrs)
19 | - [Attempt HPE's pseudofix (if you use HP)](deep-dive.md#attempt-hpes-pseudofix-if-you-use-hp)
20 | - [The comment-the-error-out hack (v3.17 - 5.3)](deep-dive.md#the-comment-the-error-out-hack-v317---53)
21 | - [Long-term solution - utilizing relaxable reservation regions (>=3.17)](deep-dive.md#long-term-solution---utilizing-relaxable-reservation-regions-317)
22 | - [Why commenting-out the error is a bad idea](deep-dive.md#why-commenting-out-the-error-is-a-bad-idea)
23 | - [The kernel moves on quickly](deep-dive.md#the-kernel-moves-on-quickly)
24 | - [What this patch actually does](deep-dive.md#what-this-patch-actually-does)
25 | - [Why kernel patch and not a loadable module?](deep-dive.md#why-kernel-patch-and-not-a-loadable-module)
26 | - [The future](deep-dive.md#the-future)
27 | 4. [Disclaimers](README.md#disclaimers)
28 | 5. [Acknowledgments & References](README.md#acknowledgments--references)
29 | 6. [License](README.md#license)
30 |
31 | ---
32 |
33 | ### Technical details
34 |
35 | #### How virtual machines use memory?
36 | To understand PCI passthrough we first need to understand how VMs work. Each VM launched in the system gets a new
37 | virtual address space and has no direct access to the host memory. Yet, the guest OS runs like it was running with a
38 | real RAM, using any memory addresses it wants. In other words the guest OS has no idea (in terms of memory) that it is
39 | being virtualized. Logically there has to be some map to translate guest OS requests to the real memory addresses, since
40 | multiple guest OSes have to share the same physical host memory. The hypervisor (host OS) is responsible for maintaining
41 | a map between GPA (Guest Physical Address) and HPA (Host Physical Address). To better understand this look at the (VERY
42 | simplified) graphics:
43 |
44 | ```
45 | +--------------------------------HOST----------------------------------------+
46 | | |
47 | | +--------------------------HOST MEMORY-------------------------------+ |
48 | | | +-------+ +----------GUEST MEMORY-----------+ | |
49 | | | | vim | |---------------------------------| | |
50 | | | | mem | |---------------------------------| | |
51 | | | +-------+ +---------------------------------+ | |
52 | | | 0xA000 0xA100 | |
53 | | +--------------------------------------------------------------------+ |
54 | | 0x0000 0xF000 0xF0FF 0x....|
55 | | |
56 | | +--------+ +----------------GUEST VM------------------+ |
57 | | | | | +------------GUEST MEMORY--------------+ | |
58 | | | vim | | | | | | | |
59 | | | | | | guest kernel| wget | | | |
60 | | +--------+ | | | mem | | | |
61 | | | +-------------+--------+---------------+ | |
62 | | | 0x00 0x1E 0x20 0xFF | |
63 | | | +------+ | |
64 | | | | wget | | |
65 | | | +------+ | |
66 | | +------------------------------------------+ |
67 | +----------------------------------------------------------------------------+
68 |
69 | (addresses don't represent real x86 space[!] and are not drawn to scale)
70 | ```
71 |
72 | When a VM is run the hypervisor gives it a predetermined amount of memory and tells the guest OS that it has a contiguous
73 | space of 255 bytes. The guest OS knows it can use 255 bytes from 0x00 and doesn't care/know where this memory physically
74 | resides. Host OS now needs to find space for 255 bytes, either in one or multiple chunks in the physical memory. It can
75 | map it as on the diagram to one big chunk or split it into multiple ones, as long as it can map guest request for its
76 | `0x1E`-`0x20` to e.g. `0xF010`-`0xF012` and return the data.
77 |
78 | ---
79 |
80 | #### Why do we need VT-d / AMD-Vi?
81 | While mapping the memory (as described in the previous section) the host OS must take care of three things:
82 | 1. When guest OS requests a page from memory using its (GPA) address it will get it from the HPA-addressed memory (=mapping)
83 | 2. Memory of the guest cannot be touched by anything other than the guest (=protection)
84 | 3. The process needs to be fast
85 |
86 | While the first two are achievable with pure software emulation, it makes the memory access process slow as molasses
87 | since it can no longer rely on [DMA](https://en.wikipedia.org/wiki/Direct_memory_access) but must involve the CPU every
88 | time bytes are shifted back and forth.
89 | Both VT-d and AMD-Vi allow to essentially instruct the hardware to do the mapping and enforce domains (security
90 | boundaries). In such case host OS simply needs to inform the hardware about the address to be translated on-the-fly.
91 |
92 | More on that can be found in the [Intel VT-d docs](https://software.intel.com/content/www/us/en/develop/articles/intel-virtualization-technology-for-directed-io-vt-d-enhancing-intel-platforms-for-efficient-virtualization-of-io-devices.html).
93 |
94 | ---
95 |
96 | #### How PCI/PCIe actually work?
97 | Most people blindly plop `intel_iommu=on` and `iommu=pt` into their kernel line and get surprised when things don't
98 | work. I did too, so I started digging, which resulted in this whole repository.
99 |
100 | Every device in the system has some reserved memory address space. It's used by the device and the host
101 | system to communicate and exchange data. That reserved memory address is dictated by the firmware (i.e. BIOS) as both
102 | the device and OS must know it to communicate. In essence this is just slightly different than normal memory mapping.
103 | Here, you don't have just some OS using the memory but an OS **and** a device using the memory.
104 |
105 | Here's where [IOMMU](https://en.wikipedia.org/wiki/Input–output_memory_management_unit) comes into play. In essence it's
106 | able to remap GPA to HPA for both the OS and the device so that they can talk to each other. When device memory is
107 | remapped the guest OS talks to the hardware like it was really under some physical address it expects, while in reality
108 | the [IOMMU](https://en.wikipedia.org/wiki/Input–output_memory_management_unit) moves the reserved region aperture
109 | somewhere else in the address space. This is *usually* fine.
110 |
111 | ---
112 |
113 | #### RMRR - the monster in a closet
114 | While both AMD and Intel allow for [IOMMU](https://en.wikipedia.org/wiki/Input–output_memory_management_unit) remapping
115 | device's memory, Intel had an idea to introduce RMRR (Reserved Memory Region Reporting). In essence the firmware/BIOS
116 | publishes a list of regions where usage of [IOMMU](https://en.wikipedia.org/wiki/Input–output_memory_management_unit) is
117 | ostensibly prohibited. The original intent for that feature was good, by allowing for USB keyboards to be automagically
118 | emulated by the USB controller itself before USB driver is loaded, like they were connected via PS/2. This also allows
119 | the GPU to display the picture before OS is loaded and even before [IOMMU](https://en.wikipedia.org/wiki/Input–output_memory_management_unit)
120 | is initialized.
121 | However, it required some sacrifices: that memory should not be remapped as only OS and the device use the [IOMMU](https://en.wikipedia.org/wiki/Input–output_memory_management_unit)
122 | and devices on the motherboard which may be communicating with e.g. the GPU pre-boot don't know anything about the
123 | mapping.
124 |
125 | However, one *undocumented assumption* was made: as soon as the driver is loaded the "out-of-band" access to the device
126 | ends and the OS takes over. However, *technically* the VT-d specification says that the RMRR is valid indefinitely.
127 |
128 | Linux for long time (up until [v3.17rc1](https://github.com/torvalds/linux/commit/c875d2c1b8083cd627ea0463e20bf22c2d7421ee))
129 | didn't respect RMRR while setting up [IOMMU](https://en.wikipedia.org/wiki/Input–output_memory_management_unit)
130 | respecting that against-the-specs but ubiquitous assumption. This was an oversight as [IOMMU](https://en.wikipedia.org/wiki/Input–output_memory_management_unit)
131 | API assumes exclusive control over the remapped address space. If such space is remapped the DMA access from outside of
132 | the [IOMMU](https://en.wikipedia.org/wiki/Input–output_memory_management_unit) domain (i.e. from something else than the
133 | host or VM guest OS, like a device on the motherboard) will fail which may lead to unpredictable results if the hardware
134 | vendor didn't follow the *undocumented assumption*.
135 |
136 |
137 | Linux, as of now, excludes two specific classes of devices from being constrained by RMRR:
138 | - USB devices (as we historically trust they don't do weird things)
139 | - GPUs (unspoken rule that they're accessed out-of-band only before the driver loads)
140 |
141 |
142 | RMRR *by itself* isn't evil, as long as it's used as [Intel's VT-d specification](https://software.intel.com/content/www/us/en/develop/download/intel-virtualization-technology-for-directed-io-architecture-specification.html)
143 | intended - "*[RMRRs] that are either not DMA targets, or memory ranges that may be target of BIOS
144 | initiated DMA only during pre-boot phase (such as from a boot disk drive) **must not** be included in the reserved
145 | memory region reporting.*".
146 |
147 |
148 | Intel anticipated that some will be tempted to misuse the feature, as they warned in the VT-d specification: "*RMRR
149 | regions are expected to be used for legacy usages (...). Platform designers should avoid or limit use of reserved memory
150 | regions*".
151 |
152 | ----
153 |
154 | #### What vendors did wrong?
155 | HP (and probably others) decided to mark **every freaking PCI device memory space as RMRR!**`*` Like that,
156 | just in case... just so that their tools could potentially maybe monitor these devices while OS agent is not installed. But
157 | wait, there's more! They marked **ALL** devices as such, even third party ones physically installed in motherboard's
158 | PCI/PCIe slots!
159 |
160 | This in turn killed PCI passthrough for any of the devices in systems running Linux [>=3.17rc1](https://github.com/torvalds/linux/commit/c875d2c1b8083cd627ea0463e20bf22c2d7421ee).
161 |
162 | *`*` In case you skipped other sections above, RMRR is a special part of the memory which cannot be moved
163 | to a VM.*
164 |
165 | ---
166 |
167 | ### Other solutions & hacks
168 |
169 | #### Contact your platform vendor
170 | As the error suggests you can try to convince your vendor to fix the BIOS. If you do please create an issue in this repo
171 | to tell me about it, as this is **the only** real solution to the problem.
172 |
173 | ---
174 |
175 | #### Use OS which ignores RMRRs
176 | Some operating systems, notably [VMWare ESXi and vSphere](https://www.vmware.com/products/esxi-and-esx.html), are
177 | believed to ignore RMRRs (cannot be verified as they're closed-source). They're able to passthrough the devices without
178 | a problem, as long as you don't do something deliberately dangerous (see [Disclaimers](README.md#disclaimers)).
179 |
180 | ---
181 |
182 | #### Attempt HPE's pseudofix (if you use HP)
183 | To HPE's credit, they [recognized the problem and released an advisory with mitigations](https://support.hpe.com/hpesc/public/docDisplay?docId=emr_na-c04781229).
184 | In short the HPE's solution is threefold:
185 | 1. Fix the firmware to not include GPUs in RMRR
186 | 2. Use System Configuration utility on Gen9+ servers to disable "HP Shared Memory features" on selected HP cards
187 | 3. Use their CLI BIOS/RBSU reconfiguration utility to set special (invisible in menus) flags opting-out PCIe slots
188 | from "smart monitoring"
189 |
190 | However, we wouldn't be here if it actually worked as expected:
191 | - Fix 1 works only on GPUs and affects Linux 3.17-5.4 (as kernel has GPU exclusion since 5.4)
192 | - Fix 2 only works on *some* **external** HPE ethernet adapters with Gen9 and newer servers
193 | - Fix 3 theoretically works on all NICs, but not other cards (e.g. HBAs) and [doesn't actually work](https://community.hpe.com/t5/proliant-servers-netservers/microserver-gen8-quot-device-is-ineligible-for-iommu-domain/td-p/6947461#.X5D7SS9h1TY)
194 | (sic!) on some servers which are listed as affected (e.g. widely popular [HP/HPE Microserver Gen8](https://support.hpe.com/hpesc/public/docDisplay?docId=emr_na-c03793258))
195 |
196 | Some tried [opening a support case](https://community.hpe.com/t5/proliant-servers-netservers/re-device-is-ineligible-for-iommu-domain-attach-due-to-platform/m-p/6817728/highlight/true#M21006)
197 | but the topic dried out. I tried [nagging HPE to fix the BIOS](https://community.hpe.com/t5/proliant-servers-ml-dl-sl/disabling-rmrds-rmrr-hp-shared-memory-features-on-microserver/td-p/7105623#.X5C0oy9h2uV).
198 | Maybe there's a chance? Who knows... the future will show.
199 |
200 | ---
201 |
202 | #### The comment-the-error-out hack (v3.17 - 5.3)
203 | I was able to track the first mentions of this method to [a post by dschense on a German Proxmox forum](https://forum.proxmox.com/threads/hp-proliant-microserver-gen8-raidcontroller-hp-p410-passthrough-probleme.30547/post-155675)
204 | ([en version](https://translate.googleusercontent.com/translate_c?depth=2&pto=aue&rurl=translate.google.com&sl=de&tl=en&u=https://forum.proxmox.com/threads/hp-proliant-microserver-gen8-raidcontroller-hp-p410-passthrough-probleme.30547/post-155675)).
205 |
206 | In essence this was a logical conclusion: if you have an error comment it out and see what happens. It worked on the
207 | original protection being introduced in Linux v3.17. Unfortunately, the Linux v5.3 changed a lot (see [next section](deep-dive.md#long-term-solution---utilizing-relaxable-reservation-regions-317)).
208 |
209 | ---
210 |
211 | #### Long-term solution - utilizing relaxable reservation regions (>=3.17)
212 |
213 | ##### Why commenting-out the error is a bad idea
214 | Before Linux v5.3, RMRR protection relied on [a simple patch introduced in v3.17](https://github.com/torvalds/linux/commit/c875d2c1b8083cd627ea0463e20bf22c2d7421ee)
215 | which excluded USB devices. [Commenting out the error](#the-comment-the-error-out-hack-v317---53) was a working
216 | solution, as the kernel (including KVM subsystem) didn't care about the reserved regions.
217 |
218 | The situation changed dramatically. A large change aimed to [introduce IOVA list management](https://patchwork.kernel.org/project/kvm/cover/20190723160637.8384-1-shameerali.kolothum.thodi@huawei.com/)
219 | outside of the [IOMMU](https://en.wikipedia.org/wiki/Input–output_memory_management_unit) driver was introduced. About
220 | the same time the RMRRs reserved memory [was split into two logical buckets](https://github.com/torvalds/linux/commit/1c5c59fbad20a63954de07687e4a29af18d1be12):
221 | absolutely-reserved (`IOMMU_RESV_DIRECT`) and so-called relaxed (`IOMMU_RESV_DIRECT_RELAXABLE`). USB devices and now
222 | GPUs were marked as *"relaxable"* as they were deemed safe to be remapped (even if against the VT-d specs and
223 | firmware's will).
224 |
225 |
226 | ##### The kernel moves on quickly
227 | Other subsystems naturally [started utilizing](https://github.com/torvalds/linux/commit/9b77e5c79840fc334a5b7f770c5ab0c09dc0e028)
228 | that new IOVA interface, which broke the *"[comment-the-error-out](#the-comment-the-error-out-hack-v317---53)"* patch.
229 | Now with the [IOMMU](https://en.wikipedia.org/wiki/Input–output_memory_management_unit) error message commented out QEMU
230 | [will explode on vfio_dma_map()](https://bugs.launchpad.net/qemu/+bug/1869006/comments/14).
231 | Understandably, and for good reasons, [developers refuse to accommodate any requests to disable that](https://bugs.launchpad.net/qemu/+bug/1869006/comments/18).
232 | While even more checks can be commented-out and patched, as more subsystems in the kernel start relying on the IOVA
233 | lists management, it will be a cat-and-mouse game after every kernel release.
234 |
235 |
236 | ##### What this patch actually does
237 | The patch plugs into the same mechanism as the vanilla kernel used to [mark USB and GPUs as "relaxable"](https://github.com/torvalds/linux/commit/1c5c59fbad20a63954de07687e4a29af18d1be12).
238 | This has three benefits:
239 | - The RMRR is not fully NULLified, as the memory is marked as reserved-with-exceptions and not just not reserved. This,
240 | combined with IOVA list management ensures that if some code somewhere needs to work differently with relaxable
241 | devices it will work with this patch properly.
242 | - This patch doesn't introduce inconsistent state in the kernel. RMRRs are not hidden from the kernel by removal, nor
243 | ignored just in one place. This patch just changes the designation of these regions from `IOMMU_RESV_DIRECT` (*"we
244 | know it's reserved and we will hold your hand"*) to [`IOMMU_RESV_DIRECT_RELAXABLE`](https://lore.kernel.org/patchwork/patch/1079954/)
245 | (*"we know it's reserved but it's your playground"*).
246 | - It works across all affected kernels (v5.9.1 being the newest at the time of writing)
247 |
248 | Additionally, this mechanism is [controllable with a boot option](README.md#configuration) making it safe and easy to
249 | disable as needed.
250 |
251 |
252 | ##### Why kernel patch and not a loadable module?
253 | Before taking this approach I poked around to see if the [IOMMU driver](https://github.com/torvalds/linux/tree/master/drivers/iommu/intel)
254 | has any API around RMRR. It does not. The driver doesn't export any functions which would make a module feasible.
255 | While Linux >=5.3 has the IOVA list management interface, it is [being built by the Intel IOMMU driver](https://github.com/torvalds/linux/commit/1c5c59fbad20a63954de07687e4a29af18d1be12).
256 | What it means is the hardcoded relaxable logic [decides about IOVA designation](https://github.com/torvalds/linux/commit/1c5c59fbad20a63954de07687e4a29af18d1be12#diff-e1fff7a2368c04e11696812359f854de9da431c63ec7c5a7bec8f6020e112a2aR2916).
257 | Later on, the same logic is [used for a final sanity check](https://github.com/torvalds/linux/blob/5f9e832c137075045d15cd6899ab0505cfb2ca4b/drivers/iommu/intel-iommu.c#L5057)
258 | independently from the state of the memory saved in the IOVA list. Only after this check passes the IOMMU mapping is
259 | added.
260 |
261 | In other words, even if the >=5.4 [IOVA API is used to modify](https://github.com/torvalds/linux/commit/af029169b8fdae31064624d60b5469a3da95ad32)
262 | the assignment, the actual IOMMU remapping will fail with the *"Device is ineligible for IOMMU domain attach..."* error.
263 |
264 |
265 | #### The future
266 | It would be great if this patch could be upstreamed. However, I see a slim-to-none chance of that happening, as this change
267 | is prone to abuse. Still, I will definitely try to communicate with kernel folks on how to proceed.
268 |
--------------------------------------------------------------------------------