├── examples ├── Makefile └── uefi │ ├── elf_x86_64_efi.lds │ ├── Makefile │ ├── main.c │ └── mp_service.h ├── scripts ├── commands-gdb ├── run-qemu.sh ├── run-qemu-win10.sh ├── gdb-run-qemu-win10.sh └── run-qemu-hvci-win10.sh ├── interface ├── hypervisor │ ├── hypervisor.h │ ├── handler_if.h │ └── vmcall_if.h └── usermode │ ├── error.hpp │ ├── hypervisor.hpp │ └── hypervisor.cpp ├── hypervisor ├── platform │ ├── serial.h │ ├── util.h │ ├── spinlock.h │ ├── serial.c │ ├── arch.h │ ├── intrin.h │ ├── standard.h │ ├── nt.h │ └── intrin.asm ├── vmm │ ├── vmm.h │ ├── vmcall.h │ ├── shim.h │ ├── nested.h │ ├── handler.h │ ├── ept.h │ ├── shim.asm │ ├── vmm_reg.h │ ├── vmcall.c │ ├── vmm_common.h │ ├── nested.c │ ├── ept.c │ ├── handler.c │ └── vmm.c ├── interrupt │ ├── idt.h │ ├── idt.asm │ └── idt.c ├── memory │ ├── pmem.h │ ├── vmem.h │ ├── mem.h │ ├── mem.c │ ├── pmem.c │ └── vmem.c ├── impl_hooks.h ├── Makefile └── hypervisor.c ├── .gitmodules ├── Makefile ├── .gitignore └── README.md /examples/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all 2 | all: 3 | $(MAKE) -C uefi -------------------------------------------------------------------------------- /scripts/commands-gdb: -------------------------------------------------------------------------------- 1 | target remote :1234 2 | layout asm 3 | set disassembly-flavor intel 4 | -------------------------------------------------------------------------------- /interface/hypervisor/hypervisor.h: -------------------------------------------------------------------------------- 1 | #ifndef HYPERVISOR_H 2 | #define HYPERVISOR_H 3 | 4 | void hypervisor_init(void); 5 | 6 | #endif /* HYPERVISOR_H */ -------------------------------------------------------------------------------- /hypervisor/platform/serial.h: -------------------------------------------------------------------------------- 1 | #ifndef SERIAL_H 2 | #define SERIAL_H 3 | 4 | void serial_init(void); 5 | void serial_print(char *str); 6 | 7 | #endif /* SERIAL_H */ -------------------------------------------------------------------------------- /hypervisor/vmm/vmm.h: -------------------------------------------------------------------------------- 1 | #ifndef VMM_H 2 | #define VMM_H 3 | 4 | #include "vmm_common.h" 5 | 6 | void vmm_init(struct vmm_init_params *params); 7 | 8 | #endif /* VMM_H */ -------------------------------------------------------------------------------- /hypervisor/vmm/vmcall.h: -------------------------------------------------------------------------------- 1 | #ifndef VMCALL_H 2 | #define VMCALL_H 3 | 4 | #include "platform/standard.h" 5 | #include "vmm_common.h" 6 | 7 | struct vmcall_ctx *vmcall_init(struct vmm_ctx *vmm); 8 | 9 | #endif /* VMCALL_H */ -------------------------------------------------------------------------------- /hypervisor/interrupt/idt.h: -------------------------------------------------------------------------------- 1 | #ifndef IDT_H 2 | #define IDT_H 3 | 4 | #include "ia32_compact.h" 5 | 6 | void idt_init(segment_descriptor_register_64 *orig_idtr, segment_descriptor_register_64 *new_idtr); 7 | 8 | #endif /* IDT_H */ -------------------------------------------------------------------------------- /scripts/run-qemu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # To exit do Ctrl-A X 4 | ./submodules/uefi-run/target/debug/uefi-run -b /usr/share/OVMF/OVMF_CODE.fd -q /usr/bin/qemu-system-x86_64 build/uefi.efi -- -nographic 
-enable-kvm -cpu host -smp 1 -------------------------------------------------------------------------------- /hypervisor/vmm/shim.h: -------------------------------------------------------------------------------- 1 | #ifndef SHIM_H 2 | #define SHIM_H 3 | 4 | #include 5 | 6 | extern __attribute__((ms_abi)) void shim_guest_to_host(void); 7 | extern __attribute__((ms_abi)) void shim_host_to_guest(void); 8 | 9 | #endif /* SHIM_H */ -------------------------------------------------------------------------------- /hypervisor/vmm/nested.h: -------------------------------------------------------------------------------- 1 | #ifndef NESTED_H 2 | #define NESTED_H 3 | 4 | #include "platform/standard.h" 5 | #include "vmm_common.h" 6 | 7 | void nested_init(struct vmm_ctx *vmm); 8 | void nested_init_vcpu(struct vcpu_ctx *vcpu); 9 | 10 | #endif /* NESTED_H */ -------------------------------------------------------------------------------- /hypervisor/memory/pmem.h: -------------------------------------------------------------------------------- 1 | #ifndef PMEM_H 2 | #define PMEM_H 3 | 4 | #include 5 | #include 6 | 7 | void pmem_init(void); 8 | uintptr_t pmem_alloc_page(void); 9 | uintptr_t pmem_alloc_contiguous(size_t bytes); 10 | void pmem_free_page(uintptr_t page); 11 | 12 | #endif /* PMEM_H */ -------------------------------------------------------------------------------- /interface/usermode/error.hpp: -------------------------------------------------------------------------------- 1 | #ifndef ERROR_H 2 | #define ERROR_H 3 | 4 | #include 5 | 6 | #define user_die_on(cond, ...) do { \ 7 | if (cond) { \ 8 | printf(__VA_ARGS__); \ 9 | while (1) {} \ 10 | } \ 11 | } while (0) 12 | 13 | #endif /* ERROR_H */ -------------------------------------------------------------------------------- /hypervisor/vmm/handler.h: -------------------------------------------------------------------------------- 1 | #ifndef HANDLER_H 2 | #define HANDLER_H 3 | 4 | #include "platform/standard.h" 5 | #include "vmm_common.h" 6 | #include "handler_if.h" 7 | 8 | struct handler_ctx *handler_init(struct vmm_ctx *vmm); 9 | __attribute__((ms_abi)) void handler_guest_to_host(struct vcpu_context *guest_ctx); 10 | 11 | #endif /* HANDLER_H */ -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "ia32-doc"] 2 | path = submodules/ia32-doc 3 | url = https://github.com/wbenny/ia32-doc.git 4 | [submodule "uefi-run"] 5 | path = submodules/uefi-run 6 | url = git@github.com:POPFD/uefi-run.git 7 | [submodule "hypervisor/platform/printf"] 8 | path = hypervisor/platform/printf 9 | url = https://github.com/mpaland/printf.git 10 | -------------------------------------------------------------------------------- /hypervisor/vmm/ept.h: -------------------------------------------------------------------------------- 1 | #ifndef EPT_H 2 | #define EPT_H 3 | 4 | #include "ia32_compact.h" 5 | 6 | struct ept_ctx *ept_init(void); 7 | eptp *ept_get_pointer(struct ept_ctx *ctx); 8 | ept_pde_2mb *get_ept_pml2e(struct ept_ctx *ctx, uintptr_t phys_addr); 9 | ept_pte *ept_get_pml1e(struct ept_ctx *ctx, uintptr_t phys_addr); 10 | void ept_invalidate_and_flush(struct ept_ctx *ctx); 11 | 12 | #endif /* EPT_H */ -------------------------------------------------------------------------------- /hypervisor/memory/vmem.h: -------------------------------------------------------------------------------- 1 | #ifndef VMEM_H 2 | #define 
VMEM_H 3 | 4 | #include "platform/standard.h" 5 | #include "ia32_compact.h" 6 | 7 | #define MEM_READ_ONLY (0) 8 | #define MEM_WRITE (1 << 0) 9 | #define MEM_EXECUTE (1 << 1) 10 | 11 | void vmem_init(cr3 *original_cr3, cr3 *new_cr3); 12 | void *vmem_alloc(size_t size, unsigned int flags); 13 | void vmem_change_perms(void *addr, size_t size, unsigned int flags); 14 | 15 | #endif /* VMEM_H */ -------------------------------------------------------------------------------- /hypervisor/platform/util.h: -------------------------------------------------------------------------------- 1 | #ifndef UTIL_H 2 | #define UTIL_H 3 | 4 | #include "standard.h" 5 | 6 | static inline void bitmap_clear_bit(uint8_t *bitmap, size_t bit) 7 | { 8 | size_t idx = bit / 8; 9 | size_t pos = bit % 8; 10 | 11 | bitmap[idx] &= ~(1 << pos); 12 | } 13 | 14 | static inline void bitmap_set_bit(uint8_t *bitmap, size_t bit) 15 | { 16 | size_t idx = bit / 8; 17 | size_t pos = bit % 8; 18 | 19 | bitmap[idx] |= 1 << pos; 20 | } 21 | 22 | #endif /* UTIL_H */ -------------------------------------------------------------------------------- /hypervisor/memory/mem.h: -------------------------------------------------------------------------------- 1 | #ifndef MEM_H 2 | #define MEM_H 3 | 4 | #include "platform/standard.h" 5 | #include "ia32_compact.h" 6 | 7 | enum copy_dir { 8 | COPY_READ, 9 | COPY_WRITE 10 | }; 11 | 12 | uintptr_t mem_va_to_pa(cr3 table, void *va); 13 | bool mem_copy_virt_tofrom_host(enum copy_dir dir, cr3 table, 14 | uintptr_t addr, void *buffer, size_t size); 15 | bool mem_copy_virt_to_virt(cr3 src_cr3, void *src, cr3 dest_cr3, void *dest, size_t size); 16 | 17 | 18 | #endif /* MEM_H */ -------------------------------------------------------------------------------- /interface/hypervisor/handler_if.h: -------------------------------------------------------------------------------- 1 | #ifndef HANDLER_COMMON_H 2 | #define HANDLER_COMMON_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | typedef void (*vmexit_cbk_t)(struct vcpu_ctx *vcpu, void *opaque, bool *move_to_next); 9 | 10 | void handler_register_exit(struct handler_ctx *ctx, 11 | size_t exit_reason, 12 | vmexit_cbk_t callback, 13 | void *opaque, 14 | bool override); 15 | 16 | #endif /* HANDLER_COMMON_H */ -------------------------------------------------------------------------------- /hypervisor/vmm/shim.asm: -------------------------------------------------------------------------------- 1 | section .text 2 | 3 | global shim_guest_to_host 4 | 5 | extern __capture_context 6 | extern handler_guest_to_host 7 | extern vmm_hyperjack_handler 8 | 9 | shim_guest_to_host: 10 | ; Save the RCX register and then load into RCX the value 11 | ; where we will want to store our stack offsetting by the 12 | ; push we just did to preserve RCX. This is then passed 13 | ; as a parameter to capture_context so that the guest 14 | ; context is stored within the host stack. 15 | push rcx 16 | lea rcx, [rsp + 08h] 17 | call __capture_context 18 | jmp handler_guest_to_host -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | # Compiler and linker specified for hypervisor 3 | 4 | # Directories 5 | ROOTDIR := $(shell pwd) 6 | BUILDDIR := $(ROOTDIR)/build 7 | OBJDIR := $(BUILDDIR)/obj 8 | 9 | # Child makefile flags 10 | export ROOTDIR 11 | export OBJDIR 12 | export BUILDDIR 13 | 14 | # Configuration for making all files. 
15 | .PHONY: all 16 | all: prep_dirs 17 | $(MAKE) -C hypervisor 18 | $(MAKE) -C examples 19 | 20 | # Configuration that creates directories needed 21 | prep_dirs: 22 | mkdir -p $(BUILDDIR) 23 | mkdir -p $(OBJDIR) 24 | 25 | # Cleaning of unneeded files. 26 | .PHONY: clean 27 | clean: 28 | rm -rf $(OBJDIR) 29 | rm -rf $(BUILDDIR) -------------------------------------------------------------------------------- /scripts/run-qemu-win10.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | TARGET_IMG=/vm/win10-uefi-dev.qcow2 4 | TARGET_IMG_SNAP=$TARGET_IMG.snap 5 | 6 | # Create a snapshot of the win10 image we want to use. 7 | rm -f $TARGET_IMG_SNAP 8 | qemu-img create -f qcow2 -F qcow2 -b $TARGET_IMG $TARGET_IMG_SNAP 9 | 10 | # To exit do Ctrl-A X 11 | ./submodules/uefi-run/target/debug/uefi-run -b /usr/share/OVMF/OVMF_CODE.fd -q /usr/local/bin/qemu-system-x86_64 -d \ 12 | build/uefi.efi -f build/hypervisor.a -- \ 13 | -display gtk -enable-kvm -serial stdio -cpu host -smp 4 -m 8G \ 14 | -drive file=$TARGET_IMG_SNAP,media=disk,if=ide,cache=off,index=1 \ 15 | -drive file=fat:rw:./build/ -------------------------------------------------------------------------------- /scripts/gdb-run-qemu-win10.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | TARGET_IMG=/vm/win10-uefi-dev.qcow2 4 | TARGET_IMG_SNAP=$TARGET_IMG.snap 5 | 6 | # Create a snapshot of the win10 image we want to use. 7 | rm -f $TARGET_IMG_SNAP 8 | qemu-img create -f qcow2 -F qcow2 -b $TARGET_IMG $TARGET_IMG_SNAP 9 | 10 | # To exit do Ctrl-A X 11 | ./submodules/uefi-run/target/debug/uefi-run -b /usr/share/OVMF/OVMF_CODE.fd -q /usr/local/bin/qemu-system-x86_64 -d \ 12 | build/uefi.efi -f build/hypervisor.a -- \ 13 | -s -S \ 14 | -d int,cpu_reset \ 15 | -display gtk -enable-kvm -serial stdio -cpu host -smp 4 -m 8G \ 16 | -drive file=$TARGET_IMG_SNAP,media=disk,if=ide,cache=off,index=1 \ 17 | -drive file=fat:rw:./build/ -------------------------------------------------------------------------------- /hypervisor/platform/spinlock.h: -------------------------------------------------------------------------------- 1 | #ifndef SPINLOCK_H 2 | #define SPINLOCK_H 3 | 4 | #include 5 | 6 | typedef int spinlock_t; 7 | 8 | static inline void spin_init(spinlock_t *lock) 9 | { 10 | *lock = 0; 11 | } 12 | 13 | static inline void spin_lock(spinlock_t *lock) 14 | { 15 | while (1) { 16 | int zero = 0; 17 | int one = 1; 18 | if (__atomic_compare_exchange(lock, &zero, &one, 0, 19 | __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) 20 | return; 21 | } 22 | } 23 | 24 | static inline void spin_unlock(spinlock_t *lock) 25 | { 26 | int zero = 0; 27 | __atomic_store(lock, &zero, __ATOMIC_SEQ_CST); 28 | } 29 | 30 | #endif /* SPINLOCK_H */ -------------------------------------------------------------------------------- /interface/usermode/hypervisor.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "vmcall_if.h" 3 | 4 | /* 5 | * Defines a standard C++ class for interfacing with 6 | * the hypervisor. 7 | */ 8 | class hypervisor { 9 | 10 | /* Windows VEH registration and checking. 11 | * we don't have access to __try & __except 12 | * intrinsincs as is MSVC so we must use VEH. */ 13 | void register_exception_handler(); 14 | bool check_and_clear_exception(); 15 | 16 | /* Performs the sending of a VMCALL to the hypervisor. 
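     * Calling convention (mirrors the handler in vmcall.c):
     *   RCX = VMCALL_SECRET_KEY
     *   RDX = guest pointer to the vmcall_param
     *   RAX = vmcall_status_t on return (0 == VMCALL_STATUS_OK)
     * A call that faults (e.g. #UD because no hypervisor is present or the
     * secret key is wrong) is caught by the registered VEH and reported as
     * a false return value.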
*/ 17 | bool send_call(vmcall_param ¶m); 18 | public: 19 | hypervisor(); 20 | 21 | /* Hypervisor specific actions. */ 22 | bool check_presence(); 23 | bool load_plugin(std::string file_name); 24 | }; -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Object files 5 | *.o 6 | *.ko 7 | *.obj 8 | *.elf 9 | 10 | # Linker output 11 | *.ilk 12 | *.map 13 | *.exp 14 | 15 | # Precompiled Headers 16 | *.gch 17 | *.pch 18 | 19 | # Libraries 20 | *.lib 21 | *.a 22 | *.la 23 | *.lo 24 | 25 | # Shared objects (inc. Windows DLLs) 26 | *.dll 27 | *.so 28 | *.so.* 29 | *.dylib 30 | 31 | # Executables 32 | *.exe 33 | *.out 34 | *.app 35 | *.i*86 36 | *.x86_64 37 | *.hex 38 | 39 | # Debug files 40 | *.dSYM/ 41 | *.su 42 | *.idb 43 | *.pdb 44 | 45 | # Kernel Module Compile Results 46 | *.mod* 47 | *.cmd 48 | .tmp_versions/ 49 | modules.order 50 | Module.symvers 51 | Mkfile.old 52 | dkms.conf 53 | 54 | # Build directory 55 | build/* 56 | 57 | # IDE 58 | .vscode/* 59 | -------------------------------------------------------------------------------- /scripts/run-qemu-hvci-win10.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | TARGET_UEFI_IMAGE=$1 4 | TARGET_IMG=/vm/hvci-win10-uefi-dev.qcow2 5 | TARGET_IMG_SNAP=$TARGET_IMG.snap 6 | 7 | SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 8 | 9 | # Create a snapshot of the win10 image we want to use. 10 | rm -f $TARGET_IMG_SNAP 11 | qemu-img create -f qcow2 -F qcow2 -b $TARGET_IMG $TARGET_IMG_SNAP 12 | 13 | # To exit do Ctrl-A X 14 | $SCRIPT_DIR/../submodules/uefi-run/target/debug/uefi-run -b /usr/share/OVMF/OVMF_CODE.fd -q /usr/local/bin/qemu-system-x86_64 $TARGET_UEFI_IMAGE -- \ 15 | -display gtk -enable-kvm -serial stdio -cpu host -smp 1 -m 8G \ 16 | -drive file=$TARGET_IMG_SNAP,media=disk,if=ide,cache=off,index=1 \ 17 | -drive file=fat:rw:./build/ -------------------------------------------------------------------------------- /hypervisor/impl_hooks.h: -------------------------------------------------------------------------------- 1 | #ifndef IMPL_HOOKS_H 2 | #define IMPL_HOOKS_H 3 | 4 | /* 5 | * This header defines all hooks (aka missing code) that 6 | * the user of this library needs to implement. 7 | * 8 | * This is to allow the hypervisor to successfully compile. 9 | * 10 | * At time of linking of the library with the consuming application 11 | * if none of these hooks are implemented you will get an error, 12 | * aka implement them. 13 | */ 14 | #include "platform/standard.h" 15 | 16 | /* Used so the hypervisor can run a specific callback on each physical processor. */ 17 | bool impl_run_all_processors(__attribute__((ms_abi)) void (*callback)(void *opaque), void *opaque); 18 | 19 | /* Used for retrieving the current processor index. 
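 * Implementations return true on success and write the calling processor's
 * index through the out parameter; see examples/uefi/main.c for an
 * EFI_MP_SERVICES based example.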
*/ 20 | bool impl_get_processor_index(size_t *index); 21 | 22 | #endif /* IMPL_HOOKS_H */ -------------------------------------------------------------------------------- /hypervisor/platform/serial.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "serial.h" 4 | #include "arch.h" 5 | 6 | #define SERIAL_PORT 0x3F8 /* COM1 */ 7 | 8 | static bool is_tx_empty(void) 9 | { 10 | return (inb(SERIAL_PORT + 5) & 0x20) != 0; 11 | } 12 | 13 | static void print_char(char c) 14 | { 15 | while (!is_tx_empty()) {}; 16 | outb(SERIAL_PORT + 0, c); 17 | } 18 | 19 | void serial_init(void) 20 | { 21 | outb(SERIAL_PORT + 1, 0x00); /* Disable seiral interrupts. */ 22 | outb(SERIAL_PORT + 3, 0x80); /* Enable DLAB (set baud rate divisor). */ 23 | outb(SERIAL_PORT + 0, 0x01); /* Set divisor to 1 (lo byte) 115200/1 baud. */ 24 | outb(SERIAL_PORT + 1, 0x00); /* (hi byte) */ 25 | outb(SERIAL_PORT + 3, 0x03); /* 8 bits, no parity, one stop bit. */ 26 | outb(SERIAL_PORT + 2, 0xC7); /* Enable FIFO, clear them with 14-byte threshold. */ 27 | outb(SERIAL_PORT + 4, 0x0B); /* IRQs enabled, RTS/DSR set. */ 28 | outb(SERIAL_PORT + 4, 0x0F); 29 | 30 | /* Clear the screen & set home position. */ 31 | serial_print("\033[2J"); 32 | serial_print("\033[H"); 33 | } 34 | 35 | void serial_print(char *str) 36 | { 37 | for (int i = 0; str[i]; i++) { 38 | print_char(str[i]); 39 | } 40 | } -------------------------------------------------------------------------------- /hypervisor/Makefile: -------------------------------------------------------------------------------- 1 | FILE_NAME := $(shell basename `pwd`) 2 | 3 | OUT_OBJ_DIR := $(OBJDIR)/$(FILE_NAME) 4 | OUT_LIB_NAME := $(BUILDDIR)/$(FILE_NAME).a 5 | 6 | CC := gcc 7 | AR := ar 8 | 9 | # CDEFINES := CONFIG_NESTED 10 | 11 | CFLAGS := -fno-stack-protector \ 12 | -fcf-protection=none \ 13 | -mno-shstk \ 14 | -fdiagnostics-color \ 15 | -fshort-wchar \ 16 | -mno-sse \ 17 | -mno-red-zone \ 18 | -Wall \ 19 | -Wextra \ 20 | -Werror \ 21 | -I../submodules/ia32-doc/out/ \ 22 | -I../interface/hypervisor/ \ 23 | -I. \ 24 | -DPRINTF_DISABLE_SUPPORT_FLOAT \ 25 | $(addprefix -D, $(CDEFINES)) 26 | 27 | ASMFLAGS := -f elf64 \ 28 | -Werror 29 | 30 | # Use wildcards to gather all of the c files we need to find. 31 | C_FILES := $(shell find . -name '*.c') 32 | C_OBJ_FILES := $(patsubst %.c,%.o, $(C_FILES)) 33 | 34 | ASM_FILES := $(shell find . -name '*.asm') 35 | ASM_OBJ_FILES := $(patsubst %.asm,%.oasm, $(ASM_FILES)) 36 | 37 | .PHONY: all 38 | all: $(OUT_LIB_NAME) 39 | 40 | %.o: %.c 41 | mkdir -p $(OUT_OBJ_DIR)/$(dir $<) 42 | $(CC) -c -o $(OUT_OBJ_DIR)/$@ $< $(CFLAGS) 43 | 44 | %.oasm: %.asm 45 | mkdir -p $(OUT_OBJ_DIR)/$(dir $<) 46 | nasm $< -o $(OUT_OBJ_DIR)/$@ $(ASMFLAGS) 47 | 48 | $(OUT_LIB_NAME): $(C_OBJ_FILES) $(ASM_OBJ_FILES) 49 | $(AR) rcs $@ $(addprefix $(OUT_OBJ_DIR)/, $^) 50 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Cascade 2 | ## A thin introspection hypervisor framework 3 | 4 | This framework runs a thin hypervisor utilising Intel VT-x to allow granular control of a system. 5 | 6 | As this is a introspection style hypervisor all system resources & devices are passed through to the 7 | "virtualised" system once initialised. Therefore any operating system of choice can then be loaded 8 | as if normal. 
9 | 10 | Cascade gives the ability to register custom exit handlers for VMEXIT's generated by the guest as 11 | well as providing a custom VMCALL interface for registering and executing your own custom callbacks 12 | at VMROOT. 13 | 14 | An example project is given as a UEFI runtime driver, allowing early boot introspection/blue-pilling 15 | of a system. This UEFI runtime driver does not do much apart from load the hypervisor, however it 16 | is an example as to how this library can be utilised. 17 | 18 | ### Installation, Compilation & Testing (Ubuntu) 19 | 1. Ensure the following dependencies are installed on your system 20 | 21 | ```sudo apt-get install qemu qemu-utils ovmf gnu-efi binutils-mingw-w64 gcc-mingw-w64 xorriso mtools cargo``` 22 | ```cargo install uefi-run``` 23 | 24 | 2. To compile run a simple make command 25 | 26 | ```make -j $(nproc)``` 27 | 28 | 3. To run the build quickly in a QEMU instance use the EFI run tool (or use ```./run-qemu.sh```) 29 | 30 | ```uefi-run -b /usr/share/OVMF/OVMF_CODE.fd -q /usr/bin/qemu-system-x86_64 build/hypervisor.efi``` -------------------------------------------------------------------------------- /examples/uefi/elf_x86_64_efi.lds: -------------------------------------------------------------------------------- 1 | /* Same as elf_x86_64_fbsd_efi.lds, except for OUTPUT_FORMAT below - KEEP IN SYNC */ 2 | OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64") 3 | OUTPUT_ARCH(i386:x86-64) 4 | ENTRY(_start) 5 | SECTIONS 6 | { 7 | . = 0; 8 | ImageBase = .; 9 | /* .hash and/or .gnu.hash MUST come first! */ 10 | .hash : { *(.hash) } 11 | .gnu.hash : { *(.gnu.hash) } 12 | . = ALIGN(4096); 13 | .eh_frame : 14 | { 15 | *(.eh_frame) 16 | } 17 | . = ALIGN(4096); 18 | .text : 19 | { 20 | _text = .; 21 | *(.text) 22 | *(.text.*) 23 | *(.gnu.linkonce.t.*) 24 | . = ALIGN(16); 25 | } 26 | _etext = .; 27 | _text_size = . - _text; 28 | . = ALIGN(4096); 29 | .reloc : 30 | { 31 | *(.reloc) 32 | } 33 | . = ALIGN(4096); 34 | .data : 35 | { 36 | _data = .; 37 | *(.rodata*) 38 | *(.got.plt) 39 | *(.got) 40 | *(.data*) 41 | *(.sdata) 42 | } 43 | .note.gnu.build-id : { *(.note.gnu.build-id) } 44 | 45 | _edata = .; 46 | _data_size = . - _etext; 47 | . = ALIGN(4096); 48 | .bss : 49 | { 50 | *(.sbss) 51 | *(.scommon) 52 | *(.dynbss) 53 | *(.bss) 54 | *(COMMON) 55 | *(.rel.local) 56 | } 57 | . = ALIGN(4096); 58 | .dynamic : { *(.dynamic) } 59 | . = ALIGN(4096); 60 | .rela : 61 | { 62 | *(.rela.data*) 63 | *(.rela.got) 64 | *(.rela.stab) 65 | } 66 | . = ALIGN(4096); 67 | .dynsym : { *(.dynsym) } 68 | . = ALIGN(4096); 69 | .dynstr : { *(.dynstr) } 70 | . 
= ALIGN(4096); 71 | .ignored.reloc : 72 | { 73 | *(.rela.reloc) 74 | *(.eh_frame) 75 | *(.note.GNU-stack) 76 | } 77 | .comment 0 : { *(.comment) } 78 | } 79 | -------------------------------------------------------------------------------- /examples/uefi/Makefile: -------------------------------------------------------------------------------- 1 | FILE_NAME := $(shell basename `pwd`) 2 | 3 | OUT_OBJ_DIR := $(OBJDIR)/$(FILE_NAME) 4 | OUT_SO_NAME := $(OBJDIR)/$(FILE_NAME)/$(FILE_NAME).so 5 | OUT_EFI_NAME := $(BUILDDIR)/$(FILE_NAME).efi 6 | 7 | EFI_LIB_PATH := /usr/lib/ 8 | 9 | CC := gcc 10 | LD := ld 11 | OC := objcopy 12 | 13 | CFLAGS := -fno-stack-protector \ 14 | -fcf-protection=none \ 15 | -mno-shstk \ 16 | -fshort-wchar \ 17 | -mno-sse \ 18 | -mno-red-zone \ 19 | -Wall \ 20 | -Wextra \ 21 | -Werror \ 22 | -I/usr/include/efi \ 23 | -I/usr/include/efi/x86_64 \ 24 | -I../../submodules/ia32-doc/out/ \ 25 | -I../../interface/hypervisor/ \ 26 | -I. \ 27 | -DEFI_FUNCTION_WRAPPER \ 28 | 29 | LDFLAGS := $(EFI_LIB_PATH)crt0-efi-x86_64.o \ 30 | -shared \ 31 | -nostdlib \ 32 | -znocombreloc \ 33 | -T elf_x86_64_efi.lds \ 34 | -Bsymbolic \ 35 | -L $(EFI_LIB_PATH) \ 36 | -l:libgnuefi.a \ 37 | -l:libefi.a \ 38 | -L $(BUILDDIR) \ 39 | -l:hypervisor.a \ 40 | 41 | OCFLAGS := -j .text \ 42 | -j .sdata \ 43 | -j .data \ 44 | -j .bss \ 45 | -j .dynamic \ 46 | -j .dynsym \ 47 | -j .rel \ 48 | -j .rela \ 49 | -j .reloc \ 50 | --subsystem efi-rtd \ 51 | --target=efi-app-x86_64 \ 52 | 53 | # Use wildcards to gather all of the c files we need to find. 54 | C_FILES := $(shell find . -name '*.c') 55 | C_OBJ_FILES := $(patsubst %.c,%.o, $(C_FILES)) 56 | 57 | .PHONY: all 58 | all: $(OUT_EFI_NAME) 59 | 60 | %.o: %.c 61 | mkdir -p $(OUT_OBJ_DIR)/$(dir $<) 62 | $(CC) -c -o $(OUT_OBJ_DIR)/$@ $< $(CFLAGS) 63 | 64 | $(OUT_SO_NAME): $(C_OBJ_FILES) 65 | $(LD) -o $@ $(addprefix $(OUT_OBJ_DIR)/, $^) $(LDFLAGS) 66 | 67 | $(OUT_EFI_NAME): $(OUT_SO_NAME) 68 | rm -f $(OUT_EFI_NAME) 69 | $(OC) $(OCFLAGS) $^ $@ -------------------------------------------------------------------------------- /hypervisor/hypervisor.c: -------------------------------------------------------------------------------- 1 | #include "platform/standard.h" 2 | #include "platform/serial.h" 3 | #include "memory/pmem.h" 4 | #include "memory/vmem.h" 5 | #include "interrupt/idt.h" 6 | #include "vmm/vmm.h" 7 | 8 | static void trigger_cpuid(void) 9 | { 10 | /* Running a CPUID should trigger an exit. */ 11 | uint64_t ticks_before = __rdtsc(); 12 | 13 | uint64_t rax, rbx, rcx, rdx; 14 | asm volatile ( 15 | "movl $0x40000000, %%eax;" 16 | "cpuid;" 17 | : "=a"(rax), "=b"(rbx), "=c"(rcx), "=d"(rdx) 18 | ); 19 | uint64_t ticks_delta = __rdtsc() - ticks_before; 20 | 21 | debug_print("Test CPUID leaf=0x40000000 eax=0x%lX ebx=0x%lX ecx=0x%lX edx=0x%lX ticks=%ld", 22 | rax, rbx, rcx, rdx, ticks_delta); 23 | } 24 | 25 | static void test_rdmsr(void) 26 | { 27 | /* Reading MSRs may trigger a VM exit, if set in the bitmap. 
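     * The __rdtsc() pair around the access gives a rough cycle count, so a
     * read that trapped to the host shows a noticeably larger delta than a
     * native RDMSR.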
*/ 28 | uint64_t ticks_before = __rdtsc(); 29 | uint64_t dummy_msr = rdmsr(IA32_TIME_STAMP_COUNTER); 30 | uint64_t ticks_delta = __rdtsc() - ticks_before; 31 | 32 | debug_print("Test RDMSR dummy_val=0x%lX ticks=%ld", dummy_msr, ticks_delta); 33 | } 34 | 35 | static void hypervisor_tests(void) 36 | { 37 | trigger_cpuid(); 38 | test_rdmsr(); 39 | } 40 | 41 | void hypervisor_init(void) 42 | { 43 | 44 | //#define DEBUG_IDA 45 | #ifdef DEBUG_IDA 46 | static volatile int wait_debug = 0; 47 | 48 | while (!wait_debug) {} 49 | #endif 50 | 51 | /* Initialise all of the required modules and set up the parameters 52 | * required for the VMM to start. */ 53 | struct vmm_init_params vmm_params = { 0 }; 54 | 55 | serial_init(); 56 | pmem_init(); 57 | vmem_init(&vmm_params.guest_cr3, &vmm_params.host_cr3); 58 | idt_init(&vmm_params.guest_idtr, &vmm_params.host_idtr); 59 | vmm_init(&vmm_params); 60 | 61 | hypervisor_tests(); 62 | 63 | debug_print("Hypervisor initialised."); 64 | } -------------------------------------------------------------------------------- /hypervisor/platform/arch.h: -------------------------------------------------------------------------------- 1 | #ifndef ARCH_H 2 | #define ARCH_H 3 | 4 | #include 5 | #include 6 | 7 | /* Segment indexes. */ 8 | enum seg_idx{ 9 | SEG_ES = 0, 10 | SEG_CS, 11 | SEG_SS, 12 | SEG_DS, 13 | SEG_FS, 14 | SEG_GS 15 | }; 16 | 17 | /* CPUID ease of use. */ 18 | struct cpuid_leaf_output { 19 | uint32_t eax; 20 | uint32_t ebx; 21 | uint32_t ecx; 22 | uint32_t edx; 23 | }; 24 | 25 | /* This has been created so that the ia_32_compact.h defines for CPUID leafs can be used 26 | * without having to manually reconstruct each of the variables one by one. */ 27 | #define CPUID_LEAF_READ(leaf, output) __get_cpuid(leaf, &output.eax.flags, &output.ebx.flags, \ 28 | &output.ecx.flags, &output.edx.flags) 29 | 30 | /* MSR and IO port handling helpers. */ 31 | static inline uint64_t rdmsr(uint64_t msr) 32 | { 33 | uint32_t low, high; 34 | asm volatile ( 35 | "rdmsr" 36 | : "=a"(low), "=d"(high) 37 | : "c"(msr) 38 | ); 39 | return ((uint64_t)high << 32) | low; 40 | } 41 | 42 | static inline void wrmsr(uint64_t msr, uint64_t value) 43 | { 44 | uint32_t low = value & 0xFFFFFFFF; 45 | uint32_t high = value >> 32; 46 | asm volatile ( 47 | "wrmsr" 48 | : 49 | : "c"(msr), "a"(low), "d"(high) 50 | ); 51 | } 52 | 53 | static inline void outb(uint16_t port, uint8_t val) 54 | { 55 | asm volatile ( "outb %0, %1" : : "a"(val), "Nd"(port) ); 56 | } 57 | 58 | static inline uint8_t inb(uint16_t port) 59 | { 60 | uint8_t ret; 61 | asm volatile ( "inb %1, %0" 62 | : "=a"(ret) 63 | : "Nd"(port) ); 64 | return ret; 65 | } 66 | 67 | /* Architecture specific register defines. */ 68 | #define CR4_VMXE_SHIFT 13ull 69 | #define CR4_VMXE_MASK (1ull << CR4_VMXE_SHIFT) 70 | #define CR4_LA57_SHIFT 12ull 71 | #define CR4_LA57_MASK (1ull << CR4_LA57_SHIFT) 72 | 73 | #endif /* ARCH_H */ -------------------------------------------------------------------------------- /examples/uefi/main.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "hypervisor.h" 7 | #include "mp_service.h" 8 | 9 | static EFI_MP_SERVICES_PROTOCOL *mp_protocol = NULL; 10 | 11 | /* 12 | * Definitions of the callback hooks that are required to be implemented 13 | * by the system. In UEFI we implement these via EFI_MP_SERVICES. 
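 * impl_run_all_processors invokes the callback on the current processor
 * first and then uses StartupAllAPs to run it on the remaining application
 * processors; impl_get_processor_index is backed by WhoAmI.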
14 | */ 15 | bool impl_run_all_processors(__attribute__((ms_abi)) void (*callback)(void *opaque), void *opaque) 16 | { 17 | EFI_STATUS status; 18 | UINTN proc_count; 19 | UINTN enabled_procs; 20 | 21 | status = uefi_call_wrapper(mp_protocol->GetNumberOfProcessors, 3, 22 | mp_protocol, &proc_count, &enabled_procs); 23 | if (status) 24 | return false; 25 | 26 | /* Call on this processor first. */ 27 | callback(opaque); 28 | 29 | /* Call on other processors now. */ 30 | if (enabled_procs > 1) { 31 | status = uefi_call_wrapper(mp_protocol->StartupAllAPs, 7, 32 | mp_protocol, (EFI_AP_PROCEDURE)callback, 33 | true, NULL, 0, opaque, NULL); 34 | if (status) 35 | return false; 36 | } 37 | 38 | return true; 39 | } 40 | 41 | bool impl_get_processor_index(size_t *index) 42 | { 43 | EFI_STATUS status = uefi_call_wrapper(mp_protocol->WhoAmI, 2, mp_protocol, (UINTN *)index); 44 | return (status == EFI_SUCCESS); 45 | } 46 | 47 | EFI_STATUS EFIAPI efi_main (EFI_HANDLE image_handle, EFI_SYSTEM_TABLE *system_table) 48 | { 49 | static const EFI_GUID MP_GUID = EFI_MP_SERVICES_PROTOCOL_GUID; 50 | 51 | InitializeLib(image_handle, system_table); 52 | 53 | /* Locate the MP protocol so we can fill in our hooks for the hypervisor. */ 54 | EFI_STATUS status = uefi_call_wrapper(system_table->BootServices->LocateProtocol, 3, 55 | &MP_GUID, NULL, &mp_protocol); 56 | 57 | if (status) 58 | return status; 59 | 60 | 61 | hypervisor_init(); 62 | 63 | return EFI_SUCCESS; 64 | } -------------------------------------------------------------------------------- /hypervisor/platform/intrin.h: -------------------------------------------------------------------------------- 1 | #ifndef INTRIN_H 2 | #define INTRIN_H 3 | 4 | #include 5 | #include "platform/standard.h" 6 | 7 | extern __attribute__((ms_abi)) uint16_t __readcs(); 8 | extern __attribute__((ms_abi)) uint64_t __readcr0(); 9 | extern __attribute__((ms_abi)) uint64_t __readcr2(); 10 | extern __attribute__((ms_abi)) uint64_t __readcr3(); 11 | extern __attribute__((ms_abi)) uint64_t __readcr4(); 12 | extern __attribute__((ms_abi)) uint64_t __readdr7(); 13 | extern __attribute__((ms_abi)) uint64_t __rdtsc(void); 14 | extern __attribute__((ms_abi)) void __writecr0(uint64_t cr0); 15 | extern __attribute__((ms_abi)) void __writecr3(uint64_t cr3); 16 | extern __attribute__((ms_abi)) void __writecr4(uint64_t cr4); 17 | extern __attribute__((ms_abi)) void __lidt(void *idt); 18 | extern __attribute__((ms_abi)) void __sidt(void *idt); 19 | extern __attribute__((ms_abi)) void __lgdt(void *gdt); 20 | extern __attribute__((ms_abi)) void __sgdt(void *gdt); 21 | extern __attribute__((ms_abi)) void __lldt(void *ldt); 22 | extern __attribute__((ms_abi)) void __sldt(void *ldt); 23 | extern __attribute__((ms_abi)) void __str(void *tr); 24 | extern __attribute__((ms_abi)) void __ltr(void *tr); 25 | extern __attribute__((ms_abi)) void __xsetbv(uint64_t field, uint64_t val); 26 | extern __attribute__((ms_abi)) void __invd(void); 27 | extern __attribute__((ms_abi)) void __invlpg(uintptr_t *addr); 28 | extern __attribute__((ms_abi)) void __invept(uint64_t ext, void *addr); 29 | extern __attribute__((ms_abi)) int __vmxon(void *vmxon); 30 | extern __attribute__((ms_abi)) int __vmclear(void *vmcs); 31 | extern __attribute__((ms_abi)) int __vmptrld(void *vmcs); 32 | extern __attribute__((ms_abi)) void __vmwrite(size_t field, size_t value); 33 | extern __attribute__((ms_abi)) size_t __vmread(size_t field); 34 | extern __attribute__((ms_abi)) int __vmlaunch(void); 35 | extern __attribute__((ms_abi, 
noreturn)) void __vmresume(void); 36 | extern __attribute__((ms_abi)) void __capture_context(void *context); 37 | extern __attribute__((ms_abi)) void __restore_context(void *context); 38 | 39 | #endif /* INTRIN_H */ -------------------------------------------------------------------------------- /hypervisor/interrupt/idt.asm: -------------------------------------------------------------------------------- 1 | ; Based upon Satoshi Tandasat's IDT handling stub/asm file. 2 | 3 | section .text 4 | 5 | extern idt_exception_handler 6 | 7 | %macro pushaq 0 8 | push rax 9 | push rcx 10 | push rdx 11 | push rbx 12 | push rbp 13 | push rsi 14 | push rdi 15 | push r8 16 | push r9 17 | push r10 18 | push r11 19 | push r12 20 | push r13 21 | push r14 22 | push r15 23 | %endmacro 24 | 25 | %macro popaq 0 26 | pop r15 27 | pop r14 28 | pop r13 29 | pop r12 30 | pop r11 31 | pop r10 32 | pop r9 33 | pop r8 34 | pop rdi 35 | pop rsi 36 | pop rbp 37 | pop rbx 38 | pop rdx 39 | pop rcx 40 | pop rax 41 | %endmacro 42 | 43 | %macro isr_err_stub 1 44 | isr_stub_%+%1: 45 | cli 46 | push %1 47 | jmp common_exception_handler 48 | %endmacro 49 | 50 | %macro isr_no_err_stub 1 51 | isr_stub_%+%1: 52 | cli 53 | push 0 54 | push %1 55 | jmp common_exception_handler 56 | %endmacro 57 | 58 | isr_no_err_stub 0 59 | isr_no_err_stub 1 60 | isr_no_err_stub 2 61 | isr_no_err_stub 3 62 | isr_no_err_stub 4 63 | isr_no_err_stub 5 64 | isr_no_err_stub 6 65 | isr_no_err_stub 7 66 | isr_err_stub 8 67 | isr_no_err_stub 9 68 | isr_err_stub 10 69 | isr_err_stub 11 70 | isr_err_stub 12 71 | isr_err_stub 13 72 | isr_err_stub 14 73 | isr_no_err_stub 15 74 | isr_no_err_stub 16 75 | isr_err_stub 17 76 | isr_no_err_stub 18 77 | isr_no_err_stub 19 78 | isr_no_err_stub 20 79 | isr_no_err_stub 21 80 | isr_no_err_stub 22 81 | isr_no_err_stub 23 82 | isr_no_err_stub 24 83 | isr_no_err_stub 25 84 | isr_no_err_stub 26 85 | isr_no_err_stub 27 86 | isr_no_err_stub 28 87 | isr_no_err_stub 29 88 | isr_err_stub 30 89 | isr_no_err_stub 31 90 | %assign i 32 91 | %rep 224 92 | isr_no_err_stub i 93 | %assign i i+1 94 | %endrep 95 | 96 | common_exception_handler: 97 | pushaq 98 | mov rdi, rsp 99 | call idt_exception_handler 100 | popaq 101 | add rsp, 10h 102 | sti 103 | iretq 104 | 105 | section .data 106 | global interrupt_vector_table 107 | interrupt_vector_table: 108 | %assign i 0 109 | %rep 256 110 | dq isr_stub_%+i ; Using DQ as we are x64 111 | %assign i i+1 112 | %endrep -------------------------------------------------------------------------------- /interface/hypervisor/vmcall_if.h: -------------------------------------------------------------------------------- 1 | #ifndef VMCALL_IF_H 2 | #define VMCALL_IF_H 3 | 4 | /* 5 | * As cascade is an introspection framework we want to be able to 6 | * control the introspection + host capabilities from within 7 | * guest applications. 8 | * 9 | * This VMCALL interface gives applications within the guest 10 | * rudimentary ability to perform such actions. 11 | */ 12 | #include 13 | #include 14 | #include 15 | 16 | /* 17 | * Secret key which the guest needs to utilise to allow for 18 | * accessing the VMCALL interface. 19 | */ 20 | #define VMCALL_SECRET_KEY ((size_t)0x0CA5CADE) 21 | 22 | /* 23 | * Generic VMCALL actions that the hypervisor provides. 24 | */ 25 | #define VMCALL_ACTION_CHECK_PRESENCE (0ull) 26 | 27 | /* Definition of an action identifier for a VMCALL. */ 28 | typedef size_t vmcall_id_t; 29 | 30 | /* Definition of a return status code for a VMCALL. 
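 * VMCALL_STATUS_OK (zero) indicates success; any non-zero value is one of
 * the error codes defined below.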
*/ 31 | typedef size_t vmcall_status_t; 32 | 33 | #define VMCALL_STATUS_OK 0ull 34 | #define VMCALL_STATUS_INVALID_PARAM 1ull 35 | #define VMCALL_STATUS_INVALID_ID 2ull 36 | #define VMCALL_STATUS_INTERNAL_ERROR 3ull 37 | 38 | /* 39 | * Definition of a VMCALL exit handler callback. 40 | * buffer is the HOST copy of the buffer provided in 41 | * the vmcall_param. Upon completion of the callback 42 | * this will then get copied back into guest context. 43 | * 44 | * Opaque is whatever was passed in when registering 45 | * the VMCALL event by the hypervisor. 46 | * 47 | * Return value size_t is a status code for the VMCALL. 48 | */ 49 | typedef vmcall_status_t (*vmcall_cbk_t)(uint8_t *buffer, void *opaque); 50 | 51 | /* 52 | * Definition of the parameter struct a guest uses when performing a 53 | * VMCALL to the hypervisor. 54 | */ 55 | struct vmcall_param { 56 | /* The unique identifier of the action to call. */ 57 | vmcall_id_t id; 58 | /* 59 | * Extra buffer space for a vmcall parameter. 60 | * This can be utilised for storing extra data 61 | * to be communicated between host <-> guest on 62 | * the VMCALL. Statically fixed to this size to 63 | * just make life easier when dealing with reading 64 | * memory to prevent having to alloc & free all 65 | * the time. 66 | */ 67 | uint8_t buffer[4096]; 68 | }; 69 | 70 | void vmcall_register_action(struct vmcall_ctx *ctx, 71 | vmcall_id_t id, 72 | vmcall_cbk_t callback, 73 | void *opaque); 74 | 75 | #endif /* VMCALL_IF_H */ -------------------------------------------------------------------------------- /interface/usermode/hypervisor.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "hypervisor.hpp" 6 | 7 | static bool g_call_fail = false; 8 | 9 | static long exception_handler(_EXCEPTION_POINTERS *info) 10 | { 11 | g_call_fail = true; 12 | std::cout << "Exception handler called.\n"; 13 | return EXCEPTION_CONTINUE_EXECUTION; 14 | } 15 | 16 | void hypervisor::register_exception_handler() 17 | { 18 | AddVectoredExceptionHandler(1 /* FIRST */, exception_handler); 19 | } 20 | 21 | bool hypervisor::check_and_clear_exception() 22 | { 23 | bool result = g_call_fail; 24 | g_call_fail = false; 25 | return result; 26 | } 27 | 28 | bool hypervisor::send_call(vmcall_param ¶m) 29 | { 30 | /* 31 | * Send the action via the VMCALL interface and check 32 | * the VEH global g_call_fail to see whether the HW 33 | * responded with a #UD or other fault. 34 | */ 35 | size_t result; 36 | asm volatile 37 | ( 38 | "vmcall\n\t" 39 | : "=a"(result) 40 | : "c"(VMCALL_SECRET_KEY), "d"(¶m) 41 | ); 42 | 43 | return !check_and_clear_exception() && (result == 0); 44 | } 45 | 46 | bool hypervisor::check_presence() 47 | { 48 | /* 49 | * As it's possible the cascade hypervisor is 50 | * hiding it's presence we use the defined VMCALL 51 | * interface with secret key to query rather than 52 | * attempting to check presence via VMXE or CPUID 53 | * hypervisor leafs. 54 | */ 55 | vmcall_param param = {}; 56 | param.action = ACTION_CHECK_PRESENCE; 57 | return send_call(param); 58 | } 59 | 60 | bool hypervisor::load_plugin(std::string file_name) 61 | { 62 | /* Load the plugin into this process dynamically then store image start. 
*/ 63 | HMODULE handle_plugin = LoadLibraryA(file_name.c_str()); 64 | if (!handle_plugin) { 65 | std::cout << "Unable to load " << file_name << " into plugin loader.\n"; 66 | } 67 | 68 | /* 69 | * It is not guaranteed that every page within the loaded image is currently 70 | * mapped into the process due to paging. As the hypervisor CANNOT deal with 71 | * paged out pages we attempt to read every page to attempt to get them all 72 | * present within memory. 73 | * 74 | * We can guarantee that every page within a DLL is readable. 75 | */ 76 | uint8_t *raw_image = reinterpret_cast(handle_plugin); 77 | PIMAGE_DOS_HEADER idh = reinterpret_cast(raw_image); 78 | PIMAGE_NT_HEADERS inh = reinterpret_cast(&raw_image[idh->e_lfanew]); 79 | 80 | for (size_t i = 0; i < inh->OptionalHeader.SizeOfImage; i += 0x1000) { 81 | /* Perform a bogus read of the page, using the if Sleep to prevent 82 | * optimisation. */ 83 | if (raw_image[i]) 84 | Sleep(0); 85 | } 86 | 87 | /* Set up the plugin loading action pointing to raw plugin bytes + size. */ 88 | vmcall_param_load_plugin plugin_param = {}; 89 | plugin_param.plugin = raw_image; 90 | 91 | /* Set up the main vmcall action pointing to our plugin parameters. */ 92 | vmcall_param param = {}; 93 | param.action = ACTION_LOAD_PLUGIN; 94 | param.param = &plugin_param; 95 | param.param_size = sizeof(plugin_param); 96 | return send_call(param); 97 | } 98 | 99 | hypervisor::hypervisor() 100 | { 101 | /* Register VEH so we can catch #UD's on send failure. */ 102 | register_exception_handler(); 103 | } -------------------------------------------------------------------------------- /hypervisor/platform/standard.h: -------------------------------------------------------------------------------- 1 | #ifndef STANDARD_H 2 | #define STANDARD_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "printf/printf.h" 10 | #include "arch.h" 11 | #include "serial.h" 12 | #include "spinlock.h" 13 | #include "intrin.h" 14 | 15 | /* Size definitions */ 16 | #define GiB(x) ((size_t)(x) << 30) 17 | #define MiB(x) ((size_t)(x) << 20) 18 | #define KiB(x) ((size_t)(x) << 10) 19 | 20 | /* Architecture definitions. */ 21 | #define PAGE_SIZE 0x1000 22 | #define PAGE_MASK (PAGE_SIZE - 1) 23 | 24 | /* Paging specific level masks. */ 25 | #define ADDRMASK_PML4_INDEX(addr) (((size_t)addr & 0xFF8000000000ULL) >> 39) 26 | #define ADDRMASK_PDPTE_INDEX(addr) (((size_t)addr & 0x7FC0000000ULL) >> 30) 27 | #define ADDRMASK_PDE_INDEX(addr) (((size_t)addr & 0x3FE00000ULL) >> 21) 28 | #define ADDRMASK_PTE_INDEX(addr) (((size_t)addr & 0x1FF000ULL) >> 12) 29 | 30 | #define ADDRMASK_PDPTE_OFFSET(addr) ((size_t)addr & 0x3FFFFFFFULL) 31 | #define ADDRMASK_PDE_OFFSET(addr) ((size_t)addr & 0x1FFFFFULL) 32 | #define ADDRMASK_PTE_OFFSET(addr) ((size_t)addr & 0xFFFULL) 33 | 34 | /* EPT/SLAT specific level masks. 
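 * The macros below split a guest-physical address into its four table
 * indices plus the page offset; e.g. GPA 0x40201000 decomposes to PML4
 * index 0, PML3 index 1, PML2 index 1, PML1 index 1 and offset 0.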
*/ 35 | #define ADDRMASK_EPT_PML4_INDEX(addr) (((size_t)addr & 0xFF8000000000ULL) >> 39) 36 | #define ADDRMASK_EPT_PML3_INDEX(addr) (((size_t)addr & 0x7FC0000000ULL) >> 30) 37 | #define ADDRMASK_EPT_PML2_INDEX(addr) (((size_t)addr & 0x3FE00000ULL) >> 21) 38 | #define ADDRMASK_EPT_PML1_INDEX(addr) (((size_t)addr & 0x1FF000ULL) >> 12) 39 | #define ADDRMASK_EPT_PML1_OFFSET(addr) ((size_t)addr & 0xFFFULL) 40 | 41 | /* Utility macros */ 42 | #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) 43 | #define NUMBER_BITS_TYPE(type) (sizeof(type) * 8) 44 | 45 | typedef uintptr_t gpa_t; 46 | typedef uintptr_t hva_t; 47 | 48 | static inline void wait_for_debugger(void) 49 | { 50 | /* 51 | * Just some crappy while != 0 loop which the debugger must explicitly 52 | * clear. This is ONLY to be used when we need to attach with GDB and 53 | * figure something out. Should not be used for anything else. 54 | * 55 | * When the debugger reaches this point, you can then simply run: 56 | * "set $eax=0" to step through and continue with what you need. 57 | * 58 | * cli/sti are used to prevent any interrupts during waiting. 59 | */ 60 | static volatile int wait_clear; 61 | 62 | wait_clear = 1; 63 | 64 | asm volatile ("cli"); 65 | 66 | while (wait_clear) 67 | asm volatile ( "pause" ); 68 | 69 | asm volatile ("sti"); 70 | } 71 | 72 | /* Debug printing */ 73 | static inline void print_buffer(const char *format, ...) 74 | { 75 | static spinlock_t sync_lock = 0; 76 | va_list marker; 77 | char tmp_buff[512] = { 0 }; 78 | 79 | spin_lock(&sync_lock); 80 | va_start(marker, format); 81 | vsnprintf(tmp_buff, sizeof(tmp_buff), format, marker); 82 | va_end(marker); 83 | serial_print(tmp_buff); 84 | spin_unlock(&sync_lock); 85 | } 86 | 87 | #define debug_print(format, ...) \ 88 | do { \ 89 | char tmp_buff[512] = { 0 }; \ 90 | snprintf(tmp_buff, sizeof(tmp_buff), "[0x%lX] %s %s (L%04d) - %s \r\n", __rdtsc(), __FILE__, __func__, __LINE__, format); \ 91 | print_buffer(tmp_buff, ##__VA_ARGS__); \ 92 | } while (0) 93 | 94 | #define die_on(cond, ...) do { \ 95 | if (cond) { \ 96 | debug_print(__VA_ARGS__); \ 97 | while (1) {} \ 98 | } \ 99 | } while (0) 100 | 101 | #define assert(cond) die_on(!(cond), "assertion failed."); 102 | 103 | #ifdef DEBUG_MODULE 104 | #define DEBUG_PRINT(...) debug_print(__VA_ARGS__) 105 | #else 106 | #define DEBUG_PRINT(...) 
107 | #endif 108 | 109 | 110 | #endif /* STANDARD_H */ -------------------------------------------------------------------------------- /hypervisor/vmm/vmm_reg.h: -------------------------------------------------------------------------------- 1 | #ifndef VMM_REG_H 2 | #define VMM_REG_H 3 | 4 | #include "ia32_compact.h" 5 | 6 | struct control_registers { 7 | cr0 reg_cr0; 8 | cr3 reg_cr3; 9 | cr4 reg_cr4; 10 | uintptr_t gs_base; 11 | ia32_debugctl_register debugctl; 12 | uintptr_t dr7; 13 | }; 14 | 15 | struct __attribute__ ((aligned (16))) m128a { 16 | uint64_t low; 17 | int64_t high; 18 | }; 19 | 20 | struct __attribute__ ((aligned (16))) xmm_save_area32 21 | { 22 | uint16_t control_word; 23 | uint16_t status_word; 24 | uint8_t tag_word; 25 | uint8_t reserved1; 26 | uint16_t error_opcode; 27 | uint32_t error_offset; 28 | uint16_t error_selector; 29 | uint16_t reserved2; 30 | uint32_t data_offset; 31 | uint16_t data_selector; 32 | uint16_t reserved3; 33 | uint32_t mx_csr; 34 | uint32_t mx_csr_mask; 35 | struct m128a float_registers[8]; 36 | struct m128a xmm_registers[16]; 37 | uint8_t reserved4[96]; 38 | }; 39 | 40 | struct __attribute__ ((aligned (16))) vcpu_context { 41 | uint64_t p1_home; 42 | uint64_t p2_home; 43 | uint64_t p3_home; 44 | uint64_t p4_home; 45 | uint64_t p5_home; 46 | uint64_t p6_home; 47 | uint32_t context_flags; 48 | uint32_t mx_csr; 49 | uint16_t seg_cs; 50 | uint16_t seg_ds; 51 | uint16_t seg_es; 52 | uint16_t seg_fs; 53 | uint16_t seg_gs; 54 | uint16_t seg_ss; 55 | uint32_t e_flags; 56 | uint64_t dr0; 57 | uint64_t dr1; 58 | uint64_t dr2; 59 | uint64_t dr3; 60 | uint64_t dr6; 61 | uint64_t dr7; 62 | uint64_t rax; 63 | uint64_t rcx; 64 | uint64_t rdx; 65 | uint64_t rbx; 66 | uint64_t rsp; 67 | uint64_t rbp; 68 | uint64_t rsi; 69 | uint64_t rdi; 70 | uint64_t r8; 71 | uint64_t r9; 72 | uint64_t r10; 73 | uint64_t r11; 74 | uint64_t r12; 75 | uint64_t r13; 76 | uint64_t r14; 77 | uint64_t r15; 78 | uint64_t rip; 79 | union 80 | { 81 | struct xmm_save_area32 flt_save; 82 | struct 83 | { 84 | struct m128a header[2]; 85 | struct m128a legacy[8]; 86 | struct m128a xmm0; 87 | struct m128a xmm1; 88 | struct m128a xmm2; 89 | struct m128a xmm3; 90 | struct m128a xmm4; 91 | struct m128a xmm5; 92 | struct m128a xmm6; 93 | struct m128a xmm7; 94 | struct m128a xmm8; 95 | struct m128a xmm9; 96 | struct m128a xmm10; 97 | struct m128a xmm11; 98 | struct m128a xmm12; 99 | struct m128a xmm13; 100 | struct m128a xmm14; 101 | struct m128a xmm15; 102 | }; 103 | }; 104 | struct m128a vector_register[26]; 105 | uint64_t vector_control; 106 | uint64_t debug_control; 107 | uint64_t last_branch_to_rip; 108 | uint64_t last_branch_from_rip; 109 | uint64_t last_exception_to_rip; 110 | uint64_t last_exception_from_rip; 111 | }; 112 | 113 | #pragma pack(push, 1) 114 | struct task_state_segment_64 115 | { 116 | uint32_t reserved0; 117 | uint64_t rsp0; 118 | uint64_t rsp1; 119 | uint64_t rsp2; 120 | uint64_t reserved1; 121 | uint64_t ist[7]; 122 | uint64_t reserved3; 123 | uint16_t reserved4; 124 | uint16_t io_map_base_address; 125 | }; 126 | #pragma pack(pop) 127 | 128 | struct gdt_config { 129 | __attribute__ ((aligned (16))) segment_descriptor_64 host_gdt[32]; 130 | __attribute__ ((aligned (16))) segment_descriptor_register_64 guest_gdtr; 131 | __attribute__ ((aligned (16))) segment_descriptor_register_64 host_gdtr; 132 | segment_selector guest_ldtr; 133 | segment_selector host_tr; 134 | struct task_state_segment_64 host_tss; 135 | }; 136 | 137 | #endif /* VMM_REG_H */ 
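The register block above is what shim.asm captures via __capture_context and what exit callbacks inspect through struct vcpu_ctx (defined in vmm_common.h, not shown here). A minimal, hedged sketch of tying it to the handler_if.h API follows; the exit-reason macro name follows ia32-doc, and the precise semantics of override and move_to_next live in handler.c, so treat those details as assumptions rather than documented behaviour.

#include "ia32_compact.h"   /* exit-reason constants as generated by ia32-doc */
#include "vmm/handler.h"    /* handler_register_exit + struct vcpu_ctx via vmm_common.h */

/*
 * Hypothetical consumer of handler_register_exit(): log every CPUID the
 * guest executes and then defer to the normal handling chain.
 */
static void cpuid_log_handler(struct vcpu_ctx *vcpu, void *opaque, bool *move_to_next)
{
    (void)opaque;

    /* Guest registers captured into struct vcpu_context by shim_guest_to_host. */
    debug_print("Guest CPUID leaf=0x%lX subleaf=0x%lX",
                vcpu->guest_context.rax, vcpu->guest_context.rcx);

    /* Assumption: letting the chain continue performs the usual CPUID
     * emulation and advances guest RIP. */
    *move_to_next = true;
}

static void install_cpuid_logger(struct handler_ctx *handler)
{
    handler_register_exit(handler, VMX_EXIT_REASON_EXECUTE_CPUID,
                          cpuid_log_handler, NULL, false /* don't override */);
}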
-------------------------------------------------------------------------------- /hypervisor/interrupt/idt.c: -------------------------------------------------------------------------------- 1 | #define DEBUG_MODULE 2 | #include "platform/standard.h" 3 | #include "platform/intrin.h" 4 | #include "vmm/vmm_common.h" 5 | #include "idt.h" 6 | 7 | struct idt_entry { 8 | uint16_t offset_15_to_0; 9 | uint16_t segment_selector; 10 | uint8_t ist : 3; 11 | uint8_t reserved_0 : 5; 12 | uint8_t gate_type : 4; 13 | uint8_t reserved_1 : 1; 14 | uint8_t dpl : 2; 15 | uint8_t present : 1; 16 | uint16_t offset_31_to_16; 17 | uint32_t offset_63_to_32; 18 | uint32_t reserved_2; 19 | } __attribute__((packed)); 20 | 21 | struct exception_stack { 22 | uint64_t r15; 23 | uint64_t r14; 24 | uint64_t r13; 25 | uint64_t r12; 26 | uint64_t r11; 27 | uint64_t r10; 28 | uint64_t r9; 29 | uint64_t r8; 30 | uint64_t rdi; 31 | uint64_t rsi; 32 | uint64_t rbp; 33 | uint64_t rbx; 34 | uint64_t rdx; 35 | uint64_t rcx; 36 | uint64_t rax; 37 | uint64_t interrupt_number; 38 | uint64_t error_code; 39 | uint64_t rip; 40 | uint64_t cs; 41 | rfl r_flags; 42 | }; 43 | 44 | #define IDT_ENTRY_COUNT 256 45 | 46 | /* The IDT handler function, this is written in NASM rather 47 | * than in C as we need full control of what goes on. */ 48 | extern void *interrupt_vector_table[]; 49 | 50 | /* The descriptor table that holds an IDT entry for each vector. */ 51 | __attribute__((aligned(0x10))) static struct idt_entry idt_table[IDT_ENTRY_COUNT] = { 0 }; 52 | 53 | /* Holds any interrupts caught in HOST that need to be forwarded to guest. */ 54 | struct cached_interrupt cached_int = { 0 }; 55 | 56 | static void set_entry(uint8_t vector, void *isr, uint8_t gate_type) 57 | { 58 | struct idt_entry *entry = &idt_table[vector]; 59 | 60 | entry->offset_15_to_0 = (uint16_t)((uintptr_t)isr); 61 | entry->segment_selector = __readcs(); 62 | entry->ist = 0; 63 | entry->reserved_0 = 0; 64 | entry->gate_type = gate_type; 65 | entry->reserved_1 = 0; 66 | entry->dpl = 0; 67 | entry->present = true; 68 | entry->offset_31_to_16 = (uint16_t)((uintptr_t)isr >> 16); 69 | entry->offset_63_to_32 = (uint32_t)((uintptr_t)isr >> 32); 70 | entry->reserved_2 = 0; 71 | } 72 | 73 | /* The exception handler that the common IDT stub function will call. */ 74 | void idt_exception_handler(const struct exception_stack *stack) 75 | { 76 | (void)stack; 77 | 78 | /* If it is an interrupt that is device specific we should deal with this properly. */ 79 | die_on(stack->interrupt_number < 0x20, "Unhandled interrupt rip %lX vec 0x%X[%d] err 0x%X cr2=0x%lX", 80 | stack->rip, 81 | stack->interrupt_number, 82 | stack->interrupt_number, 83 | stack->error_code, __readcr2()); 84 | 85 | /* 86 | * Set the pending interrupt within the VMM, on this vCPU's next 87 | * VMENTER the interrupt will be delivered to the guest. 88 | */ 89 | exception_error_code ec = { 0 }; 90 | ec.index = (uint32_t)stack->error_code; 91 | vmm_set_cached_interrupt((exception_vector)stack->interrupt_number, ec); 92 | } 93 | 94 | void idt_init(segment_descriptor_register_64 *orig_idtr, segment_descriptor_register_64 *new_idtr) 95 | { 96 | /* Store the original IDTR. */ 97 | __sidt(orig_idtr); 98 | DEBUG_PRINT("Original IDTR base_addr %lX limit %X", 99 | orig_idtr->base_address, orig_idtr->limit); 100 | 101 | /* Create the IDTR. 
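     * The limit field is the size of the table in bytes minus one, as the
     * IDTR format loaded by LIDT expects.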
*/ 102 | new_idtr->base_address = (uintptr_t)&idt_table[0]; 103 | new_idtr->limit = (uint16_t)sizeof(struct idt_entry) * IDT_ENTRY_COUNT - 1; 104 | 105 | /* Fill out all of the IDT entries with their relevant stubs. */ 106 | for (int i = 0; i < IDT_ENTRY_COUNT; i++) { 107 | set_entry(i, interrupt_vector_table[i], SEGMENT_DESCRIPTOR_TYPE_INTERRUPT_GATE); 108 | } 109 | 110 | DEBUG_PRINT("New IDTR base_addr %lX limit %X", 111 | new_idtr->base_address, new_idtr->limit); 112 | } 113 | -------------------------------------------------------------------------------- /hypervisor/platform/nt.h: -------------------------------------------------------------------------------- 1 | #ifndef NT_H 2 | #define NT_H 3 | 4 | #include "standard.h" 5 | 6 | /* Core Windows X64 defines, this is included as will be used for plugins. */ 7 | #pragma pack(push, 1) 8 | 9 | #define IMAGE_DOS_SIGNATURE 0x5A4D 10 | #define IMAGE_NT_OPTIONAL_HDR64_MAGIC 0x20b 11 | #define IMAGE_NUMBEROF_DIRECTORY_ENTRIES 16 12 | 13 | #define IMAGE_DIRECTORY_ENTRY_EXPORT 0 // Export Directory 14 | #define IMAGE_DIRECTORY_ENTRY_IMPORT 1 // Import Directory 15 | #define IMAGE_DIRECTORY_ENTRY_RESOURCE 2 // Resource Directory 16 | #define IMAGE_DIRECTORY_ENTRY_EXCEPTION 3 // Exception Directory 17 | #define IMAGE_DIRECTORY_ENTRY_SECURITY 4 // Security Directory 18 | #define IMAGE_DIRECTORY_ENTRY_BASERELOC 5 // Base Relocation Table 19 | #define IMAGE_DIRECTORY_ENTRY_DEBUG 6 // Debug Directory 20 | // IMAGE_DIRECTORY_ENTRY_COPYRIGHT 7 // (X86 usage) 21 | #define IMAGE_DIRECTORY_ENTRY_ARCHITECTURE 7 // Architecture Specific Data 22 | #define IMAGE_DIRECTORY_ENTRY_GLOBALPTR 8 // RVA of GP 23 | #define IMAGE_DIRECTORY_ENTRY_TLS 9 // TLS Directory 24 | #define IMAGE_DIRECTORY_ENTRY_LOAD_CONFIG 10 // Load Configuration Directory 25 | #define IMAGE_DIRECTORY_ENTRY_BOUND_IMPORT 11 // Bound Import Directory in headers 26 | #define IMAGE_DIRECTORY_ENTRY_IAT 12 // Import Address Table 27 | #define IMAGE_DIRECTORY_ENTRY_DELAY_IMPORT 13 // Delay Load Import Descriptors 28 | #define IMAGE_DIRECTORY_ENTRY_COM_DESCRIPTOR 14 // COM Runtime descriptor 29 | 30 | #define IMAGE_REL_BASED_ABSOLUTE 0 31 | #define IMAGE_REL_BASED_HIGH 1 32 | #define IMAGE_REL_BASED_LOW 2 33 | #define IMAGE_REL_BASED_HIGHLOW 3 34 | #define IMAGE_REL_BASED_HIGHADJ 4 35 | #define IMAGE_REL_BASED_MIPS_JMPADDR 5 36 | #define IMAGE_REL_BASED_ARM_MOV32 5 37 | #define IMAGE_REL_BASED_THUMB_MOV32 7 38 | #define IMAGE_REL_BASED_MIPS_JMPADDR16 9 39 | #define IMAGE_REL_BASED_IA64_IMM64 9 40 | #define IMAGE_REL_BASED_DIR64 10 41 | 42 | struct image_dos_header { 43 | uint16_t e_magic; 44 | uint16_t e_cblp; 45 | uint16_t e_cp; 46 | uint16_t e_crlc; 47 | uint16_t e_cparhdr; 48 | uint16_t e_minalloc; 49 | uint16_t e_maxalloc; 50 | uint16_t e_ss; 51 | uint16_t e_sp; 52 | uint16_t e_csum; 53 | uint16_t e_ip; 54 | uint16_t e_cs; 55 | uint16_t e_lfarlc; 56 | uint16_t e_ovno; 57 | uint16_t e_res[4]; 58 | uint16_t e_oemid; 59 | uint16_t e_oeminfo; 60 | uint16_t e_res2[10]; 61 | uint32_t e_lfanew; 62 | }; 63 | 64 | struct image_file_header { 65 | uint16_t machine; 66 | uint16_t number_of_sections; 67 | uint32_t time_date_stamp; 68 | uint32_t pointer_to_symbol_table; 69 | uint32_t number_of_symbols; 70 | uint16_t size_of_optional_header; 71 | uint16_t characteristics; 72 | }; 73 | 74 | struct image_data_directory { 75 | uint32_t virtual_address; 76 | uint32_t size; 77 | }; 78 | 79 | struct image_optional_header64 { 80 | uint16_t magic; 81 | uint8_t major_linker_version; 82 | uint8_t minor_linker_version; 83 
| uint32_t size_of_code; 84 | uint32_t size_of_initialized_data; 85 | uint32_t size_of_uninitialized_data; 86 | uint32_t address_of_entry_point; 87 | uint32_t base_of_code; 88 | uint64_t image_base; 89 | uint32_t section_alignment; 90 | uint32_t file_alignment; 91 | uint16_t major_operating_system_version; 92 | uint16_t minor_operating_system_version; 93 | uint16_t major_image_version; 94 | uint16_t minor_image_version; 95 | uint16_t major_subsystem_version; 96 | uint16_t minor_subsystem_version; 97 | uint32_t win_32_version_value; 98 | uint32_t size_of_image; 99 | uint32_t size_of_headers; 100 | uint32_t check_sum; 101 | uint16_t subsystem; 102 | uint16_t dll_characteristics; 103 | uint64_t size_of_stack_reserve; 104 | uint64_t size_of_stack_commit; 105 | uint64_t size_of_heap_reserve; 106 | uint64_t size_of_heap_commit; 107 | uint32_t loader_flags; 108 | uint32_t number_of_rva_and_sizes; 109 | struct image_data_directory data_directory[IMAGE_NUMBEROF_DIRECTORY_ENTRIES]; 110 | }; 111 | 112 | struct image_nt_headers64 { 113 | uint32_t signature; 114 | struct image_file_header file_header; 115 | struct image_optional_header64 optional_header; 116 | }; 117 | 118 | struct image_base_relocation { 119 | uint32_t virtual_address; 120 | uint32_t size_of_block; 121 | }; 122 | 123 | struct image_export_directory { 124 | uint32_t characteristics; 125 | uint32_t time_date_stamp; 126 | uint16_t major_version; 127 | uint16_t minor_version; 128 | uint32_t name; 129 | uint32_t base; 130 | uint32_t number_of_functions; 131 | uint32_t number_of_names; 132 | uint32_t address_of_functions; // RVA from base of image 133 | uint32_t address_of_names; // RVA from base of image 134 | uint32_t address_of_name_ordinals; // RVA from base of image 135 | }; 136 | 137 | #pragma pack(pop) 138 | 139 | #endif /* NT_H */ -------------------------------------------------------------------------------- /hypervisor/memory/mem.c: -------------------------------------------------------------------------------- 1 | #include "mem.h" 2 | 3 | static void copy_physical_page(enum copy_dir dir, uintptr_t addr, void *buffer, size_t size) 4 | { 5 | if (dir == COPY_READ) { 6 | memcpy(buffer, (const void *)addr, size); 7 | } else { 8 | memcpy((void *)addr, buffer, size); 9 | } 10 | } 11 | 12 | static pt_entry_64 *get_pte_from_va(cr3 table, void *va, int *level) 13 | { 14 | size_t pml4_idx = ADDRMASK_PML4_INDEX(va); 15 | size_t pdpte_idx = ADDRMASK_PDPTE_INDEX(va); 16 | size_t pde_idx = ADDRMASK_PDE_INDEX(va); 17 | size_t pte_idx = ADDRMASK_PTE_INDEX(va); 18 | 19 | pml4e_64 *pml4 = (pml4e_64 *)((uintptr_t)table.address_of_page_directory * PAGE_SIZE); 20 | pml4e_64 *pml4e = &pml4[pml4_idx]; 21 | if (!pml4e->present) { 22 | *level = 4; 23 | return (pt_entry_64 *)pml4e; 24 | } 25 | 26 | pdpte_64 *pdpt = (pdpte_64 *)((uintptr_t)pml4e->page_frame_number * PAGE_SIZE); 27 | pdpte_64 *pdpte = &pdpt[pdpte_idx]; 28 | if (!pdpte->present || pdpte->large_page) { 29 | *level = 3; 30 | return (pt_entry_64 *)pdpte; 31 | } 32 | 33 | pde_64 *pd = (pde_64 *)((uintptr_t)pdpte->page_frame_number * PAGE_SIZE); 34 | pde_64 *pde = &pd[pde_idx]; 35 | if (!pde->present || pde->large_page) { 36 | *level = 2; 37 | return (pt_entry_64 *)pde; 38 | } 39 | 40 | pte_64 *pt = (pte_64 *)((uintptr_t)pde->page_frame_number * PAGE_SIZE); 41 | pte_64 *pte = &pt[pte_idx]; 42 | *level = 1; 43 | return (pt_entry_64 *)pte; 44 | } 45 | 46 | uintptr_t mem_va_to_pa(cr3 table, void *va) 47 | { 48 | int level; 49 | pt_entry_64 *entry = get_pte_from_va(table, va, &level); 50 | if 
(!entry->present) { 51 | return 0; 52 | } 53 | 54 | switch (level) { 55 | case 4: 56 | die_on(true, "Invalid level of 4 retrieved for va %lX", va); 57 | break; 58 | case 3: 59 | { 60 | pdpte_1gb_64 pdpte; 61 | pdpte.flags = entry->flags; 62 | return (pdpte.page_frame_number * GiB(1)) + ADDRMASK_PDPTE_OFFSET(va); 63 | } 64 | case 2: 65 | { 66 | pde_2mb_64 pde; 67 | pde.flags = entry->flags; 68 | return (pde.page_frame_number * MiB(2)) + ADDRMASK_PDE_OFFSET(va); 69 | } 70 | case 1: 71 | { 72 | pte_64 pte; 73 | pte.flags = entry->flags; 74 | return (pte.page_frame_number * PAGE_SIZE) + ADDRMASK_PTE_OFFSET(va); 75 | } 76 | default: 77 | die_on(true, "Invalid pte level %d for va %lX", level, va); 78 | break; 79 | } 80 | } 81 | 82 | bool mem_copy_virt_tofrom_host(enum copy_dir dir, cr3 table, 83 | uintptr_t addr, void *buffer, size_t size) 84 | { 85 | die_on(!table.flags, "Invalid CR3 value"); 86 | die_on(!addr, "Invalid virtual address"); 87 | die_on(!buffer, "Invalid host buffer"); 88 | die_on(!size, "Invalid size"); 89 | 90 | bool result = true; 91 | 92 | /* 93 | * As buffer referenced in the virtual address may not have a contiguous 94 | * physical address for each page, we need to retrieve and copy pages individually. 95 | */ 96 | while (size) { 97 | /* Calculate how many bytes need to be copied for this page. */ 98 | size_t page_offset = ADDRMASK_PTE_OFFSET(addr); 99 | size_t copy_this_page = PAGE_SIZE - page_offset; 100 | size_t bytes_to_copy = (copy_this_page < size) ? copy_this_page : size; 101 | 102 | /* Get this physical address of this page. */ 103 | uintptr_t phys_addr = mem_va_to_pa(table, (void *)addr); 104 | if (!phys_addr) { 105 | result = false; 106 | break; 107 | } 108 | 109 | /* Do the operation for copying the page. */ 110 | copy_physical_page(dir, phys_addr, buffer, bytes_to_copy); 111 | 112 | /* Update counters for next page. */ 113 | addr += bytes_to_copy; 114 | buffer = (void *)((uintptr_t)buffer + bytes_to_copy); 115 | size -= bytes_to_copy; 116 | } 117 | 118 | return result; 119 | } 120 | 121 | bool mem_copy_virt_to_virt(cr3 src_cr3, void *src, cr3 dest_cr3, void *dest, size_t size) 122 | { 123 | die_on(!src_cr3.flags, "Invalid source CR3 value"); 124 | die_on(!src, "Invalid source value"); 125 | die_on(!dest_cr3.flags, "Invalid dest CR3 value"); 126 | die_on(!dest, "Invalid destination value"); 127 | die_on(!size, "Invalid size specified"); 128 | 129 | bool result = true; 130 | 131 | /* Re-read our guest -> host copy, pretty much identical. */ 132 | uintptr_t virt_src = (uintptr_t)src; 133 | uintptr_t virt_dest = (uintptr_t)dest; 134 | while (size) { 135 | /* Calculate how many bytes we can do from first page. */ 136 | size_t src_page_offset = ADDRMASK_PTE_OFFSET(virt_src); 137 | size_t src_page_bytes = PAGE_SIZE - src_page_offset; 138 | 139 | size_t dest_page_offset = ADDRMASK_PTE_OFFSET(virt_dest); 140 | size_t dest_page_bytes = PAGE_SIZE - dest_page_offset; 141 | 142 | /* Make sure we are not overlapping a copy from dest or source. 143 | * Also make sure we're not copying more than what is available left to copy. */ 144 | size_t copy_bytes = (src_page_bytes < dest_page_bytes) ? src_page_bytes : dest_page_bytes; 145 | copy_bytes = (copy_bytes < size) ? copy_bytes : size; 146 | 147 | /* Get the physical addresses for src & dest. 148 | * Since we map all physical memory into VMROOT, just copy like normal. 
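 * (The pointer casts just below treat the returned guest-physical addresses as
 * directly dereferenceable host pointers; this is only valid because the host's
 * own page tables identity-map physical memory (see memory/vmem.c), so no extra
 * guest-physical to host-virtual translation step is required.)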
*/ 149 | uint8_t *phys_src = (uint8_t *)mem_va_to_pa(src_cr3, (void *)virt_src); 150 | uint8_t *phys_dest = (uint8_t *)mem_va_to_pa(dest_cr3, (void *)virt_dest); 151 | if (!phys_src || !phys_dest) { 152 | result = false; 153 | break; 154 | } 155 | 156 | memcpy(&phys_dest[dest_page_offset], &phys_src[src_page_offset], copy_bytes); 157 | virt_src += copy_bytes; 158 | virt_dest += copy_bytes; 159 | size -= copy_bytes; 160 | } 161 | 162 | return result; 163 | } 164 | -------------------------------------------------------------------------------- /hypervisor/vmm/vmcall.c: -------------------------------------------------------------------------------- 1 | #define DEBUG_MODULE 2 | #include "platform/standard.h" 3 | #include "platform/intrin.h" 4 | #include "memory/mem.h" 5 | #include "memory/vmem.h" 6 | #include "handler.h" 7 | #include "vmcall.h" 8 | #include "vmcall_if.h" 9 | #include "vmm_common.h" 10 | 11 | struct vmcall_handler { 12 | /* Linked list pointer. */ 13 | struct vmcall_handler *next; 14 | /* The action ID for identification. */ 15 | vmcall_id_t id; 16 | /* The callback for the vmcall. */ 17 | vmcall_cbk_t callback; 18 | /* Callback specific data. */ 19 | void *opaque; 20 | }; 21 | 22 | struct vmcall_ctx { 23 | /* Hold a linked list of vmcall handlers. */ 24 | struct vmcall_handler *handlers; 25 | /* Back reference to the VMM context */ 26 | struct vmm_ctx *vmm; 27 | }; 28 | 29 | static struct vmcall_handler *find_handler(struct vmcall_ctx *ctx, vmcall_id_t id) 30 | { 31 | struct vmcall_handler *curr = ctx->handlers; 32 | while (curr) { 33 | if (curr->id == id) 34 | return curr; 35 | 36 | curr = curr->next; 37 | } 38 | return NULL; 39 | } 40 | 41 | static vmcall_status_t handle_check_presence(uint8_t *buffer, void *opaque) 42 | { 43 | (void)buffer; 44 | (void)opaque; 45 | DEBUG_PRINT("Guest checked presence."); 46 | return VMCALL_STATUS_OK; 47 | } 48 | 49 | static void vmcall_exit_handle(struct vcpu_ctx *vcpu, void *opaque, bool *move_to_next) 50 | { 51 | static const exception_error_code DEFAULT_EC = { 0 }; 52 | 53 | /* 54 | * Handled VMCALL's from the guest. 55 | * calling convention for the VMCALL interface is as follows: 56 | * 57 | * RCX = SECRET_KEY 58 | * RDX = (struct vmcall_param *) - guest pointer 59 | * 60 | * On return: 61 | * RAX = vmcall status 62 | * 63 | * If RCX is not equal to the secret key, no action taken. 64 | * If present, attempt to read the action parameter and parse. 65 | */ 66 | size_t secret_key = vcpu->guest_context.rcx; 67 | uintptr_t guest_param = vcpu->guest_context.rdx; 68 | 69 | if (secret_key != VMCALL_SECRET_KEY) { 70 | *move_to_next = false; 71 | vmm_inject_guest_event(invalid_opcode, DEFAULT_EC); 72 | return; 73 | } 74 | 75 | /* Ensure we actually have a parameter specified. */ 76 | vmcall_status_t status; 77 | if (!guest_param) { 78 | status = VMCALL_STATUS_INVALID_PARAM; 79 | goto tidyup; 80 | } 81 | 82 | /* Copy the parameter from the guest into host context. */ 83 | cr3 guest_cr3; 84 | guest_cr3.flags = __vmread(VMCS_GUEST_CR3); 85 | die_on(!guest_cr3.flags, "Guest CR3 value cannot be retrieved."); 86 | 87 | struct vmcall_param host_param = { 0 }; 88 | if (!mem_copy_virt_tofrom_host(COPY_READ, guest_cr3, guest_param, 89 | &host_param, sizeof(host_param))) { 90 | status = VMCALL_STATUS_INVALID_PARAM; 91 | goto tidyup; 92 | } 93 | 94 | /* Now lets actually do the VMCALL callback handling. */ 95 | struct vmcall_ctx *ctx = (struct vmcall_ctx *)opaque; 96 | 97 | spin_lock(&ctx->vmm->lock); 98 | 99 | /* Find the handler relevant for the VMCALL. 
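 * For reference, a minimal guest-side sketch of this convention (assuming the
 * vmcall_if.h definitions of VMCALL_SECRET_KEY, struct vmcall_param and
 * VMCALL_ACTION_CHECK_PRESENCE) would look roughly like:
 *
 *     struct vmcall_param param = { .id = VMCALL_ACTION_CHECK_PRESENCE };
 *     size_t status;
 *     __asm__ volatile("vmcall"
 *                      : "=a"(status)
 *                      : "c"(VMCALL_SECRET_KEY), "d"(&param)
 *                      : "memory");
 *     (status now holds the vmcall_status_t value written into RAX above)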
*/ 100 | struct vmcall_handler *handler = find_handler(ctx, host_param.id); 101 | if (!handler) { 102 | status = VMCALL_STATUS_INVALID_ID; 103 | goto tidyup_locked; 104 | } 105 | 106 | /* Call the handler and store status. */ 107 | status = handler->callback(host_param.buffer, handler->opaque); 108 | DEBUG_PRINT("VMCALL callback id=%ld status=%ld", host_param.id, status); 109 | 110 | /* Copy the modified host parameter back to guest memory. */ 111 | if (!mem_copy_virt_tofrom_host(COPY_WRITE, guest_cr3, guest_param, 112 | &host_param, sizeof(host_param))) { 113 | status = VMCALL_STATUS_INTERNAL_ERROR; 114 | } 115 | 116 | /* Now store success status. */ 117 | tidyup_locked: 118 | spin_unlock(&ctx->vmm->lock); 119 | tidyup: 120 | vcpu->guest_context.rax = status; 121 | *move_to_next = true; 122 | } 123 | 124 | struct vmcall_ctx *vmcall_init(struct vmm_ctx *vmm) 125 | { 126 | /* Allocate our context structure for VMCALL handling. */ 127 | struct vmcall_ctx *ctx = vmem_alloc(sizeof(struct vmcall_ctx), MEM_WRITE); 128 | die_on(!ctx, "Unable to allocate context for VMCALL handlers."); 129 | vmm->vmcall = ctx; 130 | ctx->vmm = vmm; 131 | 132 | /* Register a VMEXIT reason handler so we can catch & parse VMCALLs. */ 133 | handler_register_exit(vmm->handler, VMX_EXIT_REASON_VMCALL, vmcall_exit_handle, ctx, false); 134 | 135 | /* Register our generic VMCALL events. */ 136 | vmcall_register_action(ctx, VMCALL_ACTION_CHECK_PRESENCE, handle_check_presence, ctx); 137 | 138 | return ctx; 139 | } 140 | 141 | void vmcall_register_action(struct vmcall_ctx *ctx, 142 | vmcall_id_t id, 143 | vmcall_cbk_t callback, 144 | void *opaque) 145 | { 146 | /* Ensure synchronization. */ 147 | spin_lock(&ctx->vmm->lock); 148 | 149 | /* Ensure there isn't already a VMCALL handler registered with same ID. */ 150 | die_on(find_handler(ctx, id), 151 | "Handler already existing for VMCALL id=%ld", id); 152 | 153 | /* Allocate a new handler structure. */ 154 | struct vmcall_handler *new_handler = vmem_alloc(sizeof(struct vmcall_handler), MEM_WRITE); 155 | die_on(!new_handler, "Unable to allocate memory for VMCALL handler."); 156 | 157 | new_handler->id = id; 158 | new_handler->callback = callback; 159 | new_handler->opaque = opaque; 160 | 161 | /* Now add it to the head of our handler list. 
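 * As a usage sketch, a plugin registering its own action would call this the
 * same way the generic presence handler is registered in vmcall_init below
 * (MY_VMCALL_ID, my_callback and my_opaque are hypothetical plugin-side names):
 *
 *     vmcall_register_action(vmm->vmcall, MY_VMCALL_ID, my_callback, my_opaque);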
*/ 162 | if (!ctx->handlers) 163 | ctx->handlers = new_handler; 164 | else { 165 | new_handler->next = ctx->handlers; 166 | ctx->handlers = new_handler; 167 | } 168 | 169 | DEBUG_PRINT("VMCALL registered for id %ld cbk 0x%lX opaque 0x%lX", 170 | id, callback, opaque); 171 | spin_unlock(&ctx->vmm->lock); 172 | } -------------------------------------------------------------------------------- /hypervisor/platform/intrin.asm: -------------------------------------------------------------------------------- 1 | section .text 2 | 3 | global __readcs 4 | global __readcr0 5 | global __readcr2 6 | global __readcr3 7 | global __readcr4 8 | global __readdr7 9 | global __rdtsc 10 | global __writecr0 11 | global __writecr3 12 | global __writecr4 13 | global __lidt 14 | global __sidt 15 | global __lgdt 16 | global __sgdt 17 | global __lldt 18 | global __sldt 19 | global __str 20 | global __ltr 21 | global __xsetbv 22 | global __invd 23 | global __invlpg 24 | global __invept 25 | global __vmxon 26 | global __vmclear 27 | global __vmptrld 28 | global __vmwrite 29 | global __vmread 30 | global __vmlaunch 31 | global __vmresume 32 | global __capture_context 33 | global __restore_context 34 | 35 | __readcs: 36 | mov ax, cs 37 | ret 38 | 39 | __readcr0: 40 | mov rax, cr0 41 | ret 42 | 43 | __readcr2: 44 | mov rax, cr2 45 | ret 46 | 47 | __readcr3: 48 | mov rax, cr3 49 | ret 50 | 51 | __readcr4: 52 | mov rax, cr4 53 | ret 54 | 55 | __readdr7: 56 | mov rax, dr7 57 | ret 58 | 59 | __rdtsc: 60 | rdtsc 61 | shl rdx, 32 62 | or rax, rdx 63 | ret 64 | 65 | __writecr0: 66 | mov cr0, rcx 67 | ret 68 | 69 | __writecr3: 70 | mov cr3, rcx 71 | ret 72 | 73 | __writecr4: 74 | mov cr4, rcx 75 | ret 76 | 77 | __lidt: 78 | lidt [rcx] 79 | ret 80 | 81 | __sidt: 82 | sidt [rcx] 83 | ret 84 | 85 | __lgdt: 86 | lgdt [rcx] 87 | ret 88 | 89 | __sgdt: 90 | sgdt [rcx] 91 | ret 92 | 93 | __lldt: 94 | lldt [rcx] 95 | ret 96 | 97 | __sldt: 98 | sldt [rcx] 99 | ret 100 | 101 | __str: 102 | str word [rcx] 103 | ret 104 | 105 | __ltr: 106 | ltr word [rcx] 107 | ret 108 | 109 | __xsetbv: 110 | ; assume RCX already contains operand1 111 | ; move operand2 from RDX into EDX:EAX 112 | mov eax, edx 113 | shr rdx, 32 114 | xsetbv 115 | ret 116 | 117 | __invd: 118 | invd 119 | ret 120 | 121 | __invlpg: 122 | invlpg [rcx] 123 | ret 124 | 125 | __invept: 126 | invept rcx, [rdx] 127 | ret 128 | 129 | __vmxon: 130 | vmxon [rcx] 131 | ret 132 | 133 | __vmclear: 134 | vmclear [rcx] 135 | ret 136 | 137 | __vmptrld: 138 | vmptrld [rcx] 139 | ret 140 | 141 | __vmwrite: 142 | vmwrite rcx, rdx 143 | ret 144 | 145 | __vmread: 146 | vmread rax, rcx 147 | ret 148 | 149 | __vmlaunch: 150 | vmlaunch 151 | ret 152 | 153 | __vmresume: 154 | vmresume 155 | ; No need to RET as context RIP/RSP will change to VMCS_GUEST_R*P 156 | 157 | __capture_context: 158 | ; Push RFLAGS onto the stack 159 | pushfq 160 | 161 | ; Low GP registers 162 | mov [rcx+078h], rax 163 | mov [rcx+080h], rcx 164 | mov [rcx+088h], rdx 165 | mov [rcx+0B8h], r8 166 | mov [rcx+0C0h], r9 167 | mov [rcx+0C8h], r10 168 | mov [rcx+0D0h], r11 169 | 170 | ; Low XMM Registers 171 | movaps [rcx+01A0h], xmm0 172 | movaps [rcx+01B0h], xmm1 173 | movaps [rcx+01C0h], xmm2 174 | movaps [rcx+01D0h], xmm3 175 | movaps [rcx+01E0h], xmm4 176 | movaps [rcx+01F0h], xmm5 177 | 178 | ; Segment selectors 179 | mov word [rcx+038h], cs 180 | mov word [rcx+03Ah], ds 181 | mov word [rcx+03Ch], es 182 | mov word [rcx+042h], ss 183 | mov word [rcx+03Eh], fs 184 | mov word [rcx+040h], gs 185 | 186 | ; High GP registers 187 | 
mov [rcx+090h], rbx 188 | mov [rcx+0A0h], rbp 189 | mov [rcx+0A8h], rsi 190 | mov [rcx+0B0h], rdi 191 | mov [rcx+0D8h], r12 192 | mov [rcx+0E0h], r13 193 | mov [rcx+0E8h], r14 194 | mov [rcx+0F0h], r15 195 | 196 | ; FPU Control Word 197 | fnstcw word [rcx+0100h] 198 | mov dword [rcx+0102h], 0 199 | 200 | ; High XMM Registers 201 | movaps [rcx+0200h], xmm6 202 | movaps [rcx+0210h], xmm7 203 | movaps [rcx+0220h], xmm8 204 | movaps [rcx+0230h], xmm9 205 | movaps [rcx+0240h], xmm10 206 | movaps [rcx+0250h], xmm11 207 | movaps [rcx+0260h], xmm12 208 | movaps [rcx+0270h], xmm13 209 | movaps [rcx+0280h], xmm14 210 | movaps [rcx+0290h], xmm15 211 | 212 | ; XMM control/status register 213 | stmxcsr dword [rcx+0118h] 214 | stmxcsr dword [rcx+034h] 215 | 216 | ; Fix context RSP values 217 | lea rax, [rsp+010h] 218 | mov [rcx+098h], rax 219 | mov rax, [rsp+08h] 220 | mov [rcx+0F8h], rax 221 | mov eax, [rsp] 222 | mov [rcx+044h], eax 223 | 224 | mov dword [rcx+030h], 10000Fh 225 | 226 | ; Return 227 | add rsp, 8 228 | ret 229 | 230 | __restore_context: 231 | movaps xmm0, [rcx+01A0h] ; 232 | movaps xmm1, [rcx+01B0h] ; 233 | movaps xmm2, [rcx+01C0h] ; 234 | movaps xmm3, [rcx+01D0h] ; 235 | movaps xmm4, [rcx+01E0h] ; 236 | movaps xmm5, [rcx+01F0h] ; 237 | movaps xmm6, [rcx+0200h] ; Restore all XMM registers 238 | movaps xmm7, [rcx+0210h] ; 239 | movaps xmm8, [rcx+0220h] ; 240 | movaps xmm9, [rcx+0230h] ; 241 | movaps xmm10, [rcx+0240h] ; 242 | movaps xmm11, [rcx+0250h] ; 243 | movaps xmm12, [rcx+0260h] ; 244 | movaps xmm13, [rcx+0270h] ; 245 | movaps xmm14, [rcx+0280h] ; 246 | movaps xmm15, [rcx+0290h] ; 247 | ldmxcsr [rcx+034h] ; 248 | 249 | mov rax, [rcx+078h] ; 250 | mov rdx, [rcx+088h] ; 251 | mov r8, [rcx+0B8h] ; Restore volatile registers 252 | mov r9, [rcx+0C0h] ; 253 | mov r10, [rcx+0C8h] ; 254 | mov r11, [rcx+0D0h] ; 255 | 256 | mov rbx, [rcx+090h] ; 257 | mov rsi, [rcx+0A8h] ; 258 | mov rdi, [rcx+0B0h] ; 259 | mov rbp, [rcx+0A0h] ; Restore non volatile regsiters 260 | mov r12, [rcx+0D8h] ; 261 | mov r13, [rcx+0E0h] ; 262 | mov r14, [rcx+0E8h] ; 263 | mov r15, [rcx+0F0h] ; 264 | 265 | cli ; Disable interrupts 266 | push qword [rcx+044h] ; Push RFLAGS on stack 267 | popfq ; Restore RFLAGS 268 | mov rsp, [rcx+098h] ; Restore old stack 269 | push qword [rcx+0F8h] ; Push RIP on old stack 270 | mov rcx, [rcx+080h] ; Restore RCX since we spilled it 271 | ret ; Restore RIP -------------------------------------------------------------------------------- /hypervisor/vmm/vmm_common.h: -------------------------------------------------------------------------------- 1 | #ifndef VMM_COMMON_H 2 | #define VMM_COMMON_H 3 | 4 | #include 5 | #include "platform/intrin.h" 6 | #include "platform/util.h" 7 | #include "vmm_reg.h" 8 | #include "ia32_compact.h" 9 | 10 | /* At max, support up to 100 vCPUs. */ 11 | #define VCPU_MAX 100 12 | 13 | /* Defines the size of the host stack. */ 14 | #define HOST_STACK_SIZE 0x6000 15 | 16 | struct vmm_init_params { 17 | __attribute__((aligned(0x10))) cr3 guest_cr3; 18 | __attribute__((aligned(0x10))) cr3 host_cr3; 19 | __attribute__((aligned(0x10))) segment_descriptor_register_64 guest_idtr; 20 | __attribute__((aligned(0x10))) segment_descriptor_register_64 host_idtr; 21 | uintptr_t image_base; 22 | size_t image_size; 23 | }; 24 | 25 | /* 26 | * Holds whether a vCPU currently has a cached interrupt 27 | * to deliver to the guest. 28 | * 29 | * No synchronisation method is needed for this as 30 | * set/get of this structure will happen within same 31 | * vCPU. 
32 | * 33 | * TODO: HOWEVER we should probably account for multiple 34 | * interrupts happening within same VMEXIT frame eventually. 35 | */ 36 | struct cached_interrupt { 37 | exception_vector vector; 38 | exception_error_code code; 39 | bool pending; 40 | }; 41 | 42 | /* Holds the global context for the VMM. */ 43 | struct vmm_ctx { 44 | struct vmm_init_params init; 45 | struct ept_ctx *ept; 46 | struct handler_ctx *handler; 47 | struct vmcall_ctx *vmcall; 48 | spinlock_t lock; 49 | }; 50 | 51 | /* Holds the context specific to a singular vCPU. */ 52 | struct vcpu_ctx { 53 | __attribute__ ((aligned (PAGE_SIZE))) uint8_t host_stack[HOST_STACK_SIZE]; 54 | __attribute__ ((aligned (PAGE_SIZE))) vmxon host_vmxon; 55 | __attribute__ ((aligned (PAGE_SIZE))) vmcs guest_vmcs; 56 | __attribute__ ((aligned (PAGE_SIZE))) uint8_t msr_trap_bitmap[PAGE_SIZE]; 57 | 58 | struct vcpu_context hyperjack_context; 59 | struct control_registers guest_ctrl_regs; 60 | struct vcpu_context guest_context; 61 | struct gdt_config gdt_cfg; 62 | struct cached_interrupt cached_int; 63 | 64 | struct vmm_ctx *vmm; 65 | struct nested_ctx *nested; 66 | size_t idx; 67 | 68 | bool launched; 69 | size_t last_ignored_msr; 70 | }; 71 | 72 | static inline struct vcpu_ctx *vmm_get_vcpu_ctx(void) 73 | { 74 | /* 75 | * Dirty hack, as GS_BASE is actually unused on x86_64 76 | * we can use this field in the host context to store/retrieve 77 | * the vCPU context pointer. 78 | */ 79 | struct vcpu_ctx *vcpu = (struct vcpu_ctx *)rdmsr(IA32_GS_BASE); 80 | die_on(!vcpu, "vCPU context not correct."); 81 | return vcpu; 82 | } 83 | 84 | static inline size_t vmm_read_gp_register(struct vcpu_ctx *vcpu, uint64_t base_reg) 85 | { 86 | assert(base_reg < 16); 87 | 88 | size_t reg_val; 89 | if (base_reg == 4 /* RSP is 4th in array context. 
*/) 90 | reg_val = __vmread(VMCS_GUEST_RSP); 91 | else { 92 | uint64_t *gp_arr = &vcpu->guest_context.rax; 93 | reg_val = gp_arr[base_reg]; 94 | } 95 | return reg_val; 96 | } 97 | 98 | static inline size_t vmm_read_seg_register(struct vcpu_ctx *vcpu, uint64_t seg_index) 99 | { 100 | assert(seg_index < 6); 101 | 102 | switch (seg_index) { 103 | case 0: 104 | return vcpu->guest_context.seg_es; 105 | case 1: 106 | return vcpu->guest_context.seg_cs; 107 | case 2: 108 | return vcpu->guest_context.seg_ss; 109 | case 3: 110 | return vcpu->guest_context.seg_ds; 111 | case 4: 112 | return vcpu->guest_context.seg_fs; 113 | default: /* case 5 (GS); default also avoids falling off a non-void function. */ 114 | return vcpu->guest_context.seg_gs; 115 | } 116 | } 117 | 118 | static inline void vmm_set_cached_interrupt(exception_vector vector, exception_error_code code) 119 | { 120 | struct vcpu_ctx *vcpu = vmm_get_vcpu_ctx(); 121 | 122 | vcpu->cached_int.vector = vector; 123 | vcpu->cached_int.code = code; 124 | vcpu->cached_int.pending = true; 125 | } 126 | 127 | static inline void vmm_msr_trap_enable(uint8_t *bitmap, size_t msr, bool trap) 128 | { 129 | static const size_t LOW_START = 0x0; 130 | static const size_t LOW_END = 0x1fff; 131 | static const size_t HIGH_START = 0xc0000000; 132 | static const size_t HIGH_END = 0xc0001fff; 133 | 134 | uint8_t *read_low = &bitmap[0]; 135 | uint8_t *read_high = &bitmap[1024]; 136 | uint8_t *write_low = &bitmap[2048]; 137 | uint8_t *write_high = &bitmap[3072]; 138 | 139 | if ((msr >= LOW_START) && (msr <= LOW_END)) { 140 | if (trap) { 141 | bitmap_set_bit(read_low, msr); 142 | bitmap_set_bit(write_low, msr); 143 | } else { 144 | bitmap_clear_bit(read_low, msr); 145 | bitmap_clear_bit(write_low, msr); 146 | } 147 | } else if ((msr >= HIGH_START) && (msr <= HIGH_END)) { 148 | size_t offset = msr - HIGH_START; 149 | if (trap) { 150 | bitmap_set_bit(read_high, offset); 151 | bitmap_set_bit(write_high, offset); 152 | } else { 153 | bitmap_clear_bit(read_high, offset); 154 | bitmap_clear_bit(write_high, offset); 155 | } 156 | } else { 157 | die_on(true, "MSR 0x%lX out of valid range", msr); 158 | } 159 | } 160 | 161 | static inline uint8_t vmm_virt_addr_bits() 162 | { 163 | return __vmread(VMCS_HOST_CR4) & CR4_LA57_MASK ? 
57 : 48; 164 | } 165 | 166 | static inline uint64_t get_canonical(uint64_t la, uint8_t vaddr_bits) 167 | { 168 | return ((int64_t)la << (64 - vaddr_bits)) >> (64 - vaddr_bits); 169 | } 170 | 171 | static inline bool is_noncanonical_address(uint64_t la) 172 | { 173 | return get_canonical(la, vmm_virt_addr_bits()) != la; 174 | } 175 | 176 | static inline cr0 vmm_adjust_cr0(cr0 old_cr0) 177 | { 178 | cr0 fixed0, fixed1; 179 | cr0 new_cr0 = old_cr0; 180 | fixed0.flags = rdmsr(IA32_VMX_CR0_FIXED0); 181 | fixed1.flags = rdmsr(IA32_VMX_CR0_FIXED1); 182 | 183 | new_cr0.flags &= fixed1.flags; 184 | new_cr0.flags |= fixed0.flags; 185 | 186 | DEBUG_PRINT("old_cr0=0x%lX new_cr0=0x%lX fixed0=0x%lX fixed1=0x%lX", 187 | old_cr0.flags, new_cr0.flags, fixed0.flags, fixed1.flags); 188 | 189 | return new_cr0; 190 | } 191 | 192 | static inline cr4 vmm_adjust_cr4(cr4 old_cr4) 193 | { 194 | cr4 fixed0, fixed1; 195 | cr4 new_cr4 = old_cr4; 196 | fixed0.flags = rdmsr(IA32_VMX_CR4_FIXED0); 197 | fixed1.flags = rdmsr(IA32_VMX_CR4_FIXED1); 198 | 199 | new_cr4.flags &= fixed1.flags; 200 | new_cr4.flags |= fixed0.flags; 201 | 202 | DEBUG_PRINT("old_cr4=0x%lX new_cr4=0x%lX fixed0=0x%lX fixed1=0x%lX", 203 | old_cr4.flags, new_cr4.flags, fixed0.flags, fixed1.flags); 204 | 205 | return new_cr4; 206 | } 207 | 208 | void vmm_inject_guest_event(exception_vector vector, exception_error_code code); 209 | 210 | #endif /* VMM_COMMON_H */ -------------------------------------------------------------------------------- /hypervisor/memory/pmem.c: -------------------------------------------------------------------------------- 1 | //#define DEBUG_MODULE 2 | #include "platform/standard.h" 3 | #include "platform/spinlock.h" 4 | #include "pmem.h" 5 | 6 | /* 7 | * Standalone physical memory manager. 8 | * This module can be used for allocating pages of physical memory within 9 | * the host system. The lowest granularity supported is the size of a 4K page 10 | * due to page table restrictions (x86_64) as this is used with things such as 11 | * EPT where 4k pages are needed this is no problem (and no need to create a 12 | * heap allocator then). 13 | * 14 | * The MM does not use 0xE820 or the EFI equivalent as the goal of this system 15 | * is a bit different. Instead we use a reserved PMEM range that is effectively 16 | * a uint8_t array within the .data? section of the application. The reasoning 17 | * behind this is that in the future when we look at implementing EPT and potentially 18 | * hiding the EFI driver from the guest OS running on the system all allocated/used 19 | * memory is actually within actual image so we don't have to traverse or keep 20 | * a record of memory allocated elsewhere. 21 | */ 22 | 23 | /* Defines the size of physical memory that can be used. */ 24 | #define PMEM_SIZE MiB(20) 25 | #define PMEM_PAGE_COUNT (PMEM_SIZE / PAGE_SIZE) 26 | 27 | #define PAGE_COUNT(byte_count) ((byte_count + PAGE_SIZE - 1) / PAGE_SIZE) 28 | #define SET_N_BITS(n) ((1 << n) - 1) 29 | 30 | /* 31 | * This *SHOULD* reside in the .bss section 32 | * not affecting actual PE size, however will be allocated at load 33 | * time of the image. 34 | */ 35 | static uint8_t __attribute__ ((aligned (PAGE_SIZE))) pmem_region[PMEM_SIZE] = { 0 }; 36 | 37 | /* 38 | * A bitmap will be used for storing which pages are used/free 39 | * Each bit represents a page. 
We will also store the index of 40 | * the last page allocated; this is to speed up allocation 41 | * when contiguous pages need to be allocated (however this 42 | * will not help for when we overflow or memory is freed). 43 | * 44 | * TODO: Make this better..... 45 | */ 46 | static size_t pmem_bitmap[PMEM_PAGE_COUNT / NUMBER_BITS_TYPE(size_t)]; 47 | static size_t pmem_last_chunk_idx; 48 | static size_t pmem_last_bit_idx; 49 | 50 | static spinlock_t lock; 51 | static size_t total_allocated = 0; 52 | 53 | static inline bool find_contiguous_unset(size_t count, 54 | size_t *found_chunk_idx, 55 | size_t *found_bit_idx) 56 | { 57 | size_t chunk_idx = pmem_last_chunk_idx; 58 | size_t bit_idx = pmem_last_bit_idx; 59 | 60 | do { 61 | 62 | size_t curr_count = 0; 63 | while (1) { 64 | 65 | /* Check to see if the current bit is unused. */ 66 | if (!((pmem_bitmap[chunk_idx] >> bit_idx) & 1)) 67 | curr_count++; 68 | else 69 | curr_count = 0; 70 | 71 | /* If we have found N contiguous bits return value. */ 72 | if (curr_count == count) { 73 | *found_chunk_idx = chunk_idx; 74 | *found_bit_idx = bit_idx; 75 | return true; 76 | } 77 | 78 | /* Increment the chunk and bit indexes. */ 79 | bit_idx = (bit_idx + 1) % NUMBER_BITS_TYPE(pmem_bitmap[0]); 80 | if (!bit_idx) 81 | chunk_idx = (chunk_idx + 1) % ARRAY_SIZE(pmem_bitmap); 82 | 83 | /* 84 | * If chunk index has iterated to beginning of bitmap 85 | * we must reset current count as overflow cannot count 86 | * as contiguous. 87 | */ 88 | if (!chunk_idx && !bit_idx) 89 | curr_count = 0; 90 | } 91 | } while ((chunk_idx != pmem_last_chunk_idx) && (bit_idx != pmem_last_bit_idx)); 92 | 93 | return false; 94 | } 95 | 96 | static inline void set_contiguous_bits(size_t chunk_idx, size_t bit_idx, size_t count, bool val) 97 | { 98 | for (size_t i = 0; i < count; i++) { 99 | die_on(((pmem_bitmap[chunk_idx] >> bit_idx) & 1) == val, 100 | "Bit already at requested value, chunk %d bit %d", 101 | chunk_idx, bit_idx); 102 | 103 | if (val) 104 | pmem_bitmap[chunk_idx] |= 1ull << bit_idx; 105 | else 106 | pmem_bitmap[chunk_idx] &= ~(1ull << bit_idx); 107 | 108 | bit_idx = (bit_idx + 1) % NUMBER_BITS_TYPE(pmem_bitmap[0]); 109 | if (!bit_idx) 110 | chunk_idx = (chunk_idx + 1) % ARRAY_SIZE(pmem_bitmap); 111 | } 112 | } 113 | 114 | void pmem_init(void) 115 | { 116 | /* Let's ensure everything is cleared/zero'd. */ 117 | pmem_last_chunk_idx = 0; 118 | pmem_last_bit_idx = 0; 119 | memset(pmem_bitmap, 0, sizeof(pmem_bitmap)); 120 | memset(pmem_region, 0, sizeof(pmem_region)); 121 | 122 | spin_init(&lock); 123 | } 124 | 125 | uintptr_t pmem_alloc_page(void) 126 | { 127 | spin_lock(&lock); 128 | 129 | /* Search for 1 contiguous page in the bitmap that is unset. */ 130 | size_t chunk_idx; 131 | size_t bit_idx; 132 | if (find_contiguous_unset(1, &chunk_idx, &bit_idx)) { 133 | DEBUG_PRINT("Found chunk %d bit %d", chunk_idx, bit_idx); 134 | 135 | /* Indicate that bit & page is now in use. */ 136 | set_contiguous_bits(chunk_idx, bit_idx, 1, true); 137 | 138 | /* Update the last index stored to help speed up future allocations. */ 139 | pmem_last_chunk_idx = chunk_idx; 140 | pmem_last_bit_idx = bit_idx; 141 | 142 | /* Calculate & return the allocated page's address. 
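 * (Worked example: with 64-bit bitmap chunks, chunk_idx = 2 and bit_idx = 5
 * select page (2 * 64) + 5 = 133, i.e. the page starting at
 * pmem_region + 133 * PAGE_SIZE.)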
*/ 143 | size_t offset = (chunk_idx * NUMBER_BITS_TYPE(pmem_bitmap[0])) + bit_idx; 144 | offset *= PAGE_SIZE; 145 | 146 | uint8_t *result = &pmem_region[offset]; 147 | memset(result, 0, PAGE_SIZE); 148 | spin_unlock(&lock); 149 | return (uintptr_t)result; 150 | } 151 | 152 | spin_unlock(&lock); 153 | return 0; 154 | } 155 | 156 | uintptr_t pmem_alloc_contiguous(size_t bytes) 157 | { 158 | die_on(!bytes, "Invalid parameter unable to allocate 0 bytes."); 159 | die_on(bytes > (NUMBER_BITS_TYPE(pmem_bitmap[0]) * PAGE_SIZE), 160 | "Current pmem allocator cannot allocate enough pages to fit %d bytes", 161 | bytes); 162 | 163 | const size_t number_pages = PAGE_COUNT(bytes); 164 | 165 | spin_lock(&lock); 166 | 167 | size_t chunk_idx; 168 | size_t bit_idx; 169 | if (find_contiguous_unset(number_pages, &chunk_idx, &bit_idx)) { 170 | DEBUG_PRINT("Found chunk %d bit %d", chunk_idx, bit_idx); 171 | 172 | /* Indicate the following bits are used. */ 173 | set_contiguous_bits(chunk_idx, bit_idx, number_pages, true); 174 | 175 | /* Update the last index stored to help speed up future allocations. */ 176 | pmem_last_chunk_idx = chunk_idx; 177 | pmem_last_bit_idx = bit_idx; 178 | 179 | /* Calculate the physical address of the buffer. */ 180 | size_t offset_chunk = (NUMBER_BITS_TYPE(pmem_bitmap[0]) * chunk_idx) * PAGE_SIZE; 181 | size_t offset_bit = bit_idx * PAGE_SIZE; 182 | 183 | uint8_t *result = &pmem_region[offset_chunk + offset_bit]; 184 | memset(result, 0, bytes); 185 | spin_unlock(&lock); 186 | total_allocated += PAGE_COUNT(bytes) * PAGE_SIZE; 187 | return (uintptr_t)result; 188 | } 189 | 190 | spin_unlock(&lock); 191 | return 0; 192 | } 193 | 194 | void pmem_free_page(uintptr_t page) 195 | { 196 | /* Assert that the memory to free is within our allocator range. 197 | * and that it is page aligned. */ 198 | assert(page >= (uintptr_t)pmem_region); 199 | assert(page < (uintptr_t)pmem_region + sizeof(pmem_region)); 200 | assert(!(page & (PAGE_SIZE - 1))); 201 | 202 | spin_lock(&lock); 203 | 204 | /* Calculate the page index in the bitmap. */ 205 | size_t offset = page - (uintptr_t)&pmem_region[0]; 206 | size_t full_bit_index = offset / PAGE_SIZE; 207 | 208 | size_t chunk_idx = full_bit_index / NUMBER_BITS_TYPE(pmem_bitmap[0]); 209 | size_t bit_idx = full_bit_index % NUMBER_BITS_TYPE(pmem_bitmap[0]); 210 | 211 | set_contiguous_bits(chunk_idx, bit_idx, 1, false); 212 | spin_unlock(&lock); 213 | } -------------------------------------------------------------------------------- /hypervisor/memory/vmem.c: -------------------------------------------------------------------------------- 1 | //#define DEBUG_MODULE 2 | #include "platform/standard.h" 3 | #include "platform/spinlock.h" 4 | #include "platform/intrin.h" 5 | #include "memory/pmem.h" 6 | #include "memory/vmem.h" 7 | #include "memory/mem.h" 8 | 9 | /* Standalone virtual memory manager. 10 | * This module created the page tables required to implement virtual memory 11 | * which will be used the by HOST/ROOT of the hypervisor. 12 | * When booted from an EFI environment normally memory is identity mapped 13 | * with a 1:1 of physical to virtual memory. We are going to do the same 14 | * for here. 15 | * 16 | * However, physical memory (allocated via the pmem module) can be allocated 17 | * for use (so contiguous pages can exist) into virtual memory. 18 | * The first 512GB of vmem will be identity mapped, and then the second 512GB (PML4 idx 1) 19 | * address starting at 0x8000000000 of virtual space will be used for memory allocated 20 | * within here. 
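 * As an illustration: a virtual address below GiB(512) (0x0000008000000000) is
 * resolved through the identity-mapped PDPT and equals its physical address,
 * while the first vmem_alloc() call returns 0x8000000000 and later allocations
 * simply grow upwards from there.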
21 | * 22 | * This allows for an easy seperation and determination of identity mapped vs allocated 23 | * memory. 24 | */ 25 | 26 | #define ENTRIES_PER_TABLE 512 27 | #define DYN_VMEM_START GiB(512) 28 | 29 | struct vmem_ctx { 30 | /* Describes the 512 contiguous 512GB memory regions. */ 31 | __attribute__ ((aligned (PAGE_SIZE))) pml4e_64 pml4[ENTRIES_PER_TABLE]; 32 | 33 | /* Describes the first 512 1GB memory regions within PML4[0] used for identity mapping. 34 | * These will be set as large pages. 35 | * So that we don't need to go to lower granularity (2MB or 4k). */ 36 | __attribute__ ((aligned (PAGE_SIZE))) pdpte_1gb_64 identity_pdpt[ENTRIES_PER_TABLE]; 37 | 38 | /* 39 | * pml4e_64 40 | * --- pdpte_64 41 | * ------ pde_64 42 | * ---------pte_64 43 | */ 44 | 45 | uintptr_t next_free_addr; 46 | spinlock_t sync; 47 | }; 48 | 49 | static struct vmem_ctx *m_ctx = NULL; 50 | 51 | static void init_identity_table(struct vmem_ctx *ctx) 52 | { 53 | /* Set out the first PML4E to indicate this is present 54 | * and this is what we'll be using for identity mapping. */ 55 | ctx->pml4[0].present = true; 56 | ctx->pml4[0].write = true; 57 | ctx->pml4[0].page_frame_number = ((uintptr_t)ctx->identity_pdpt) / PAGE_SIZE; 58 | 59 | for (size_t i = 0; i < 512; i++) { 60 | ctx->identity_pdpt[i].present = true; 61 | ctx->identity_pdpt[i].write = true; 62 | ctx->identity_pdpt[i].execute_disable = false; 63 | ctx->identity_pdpt[i].large_page = true; 64 | ctx->identity_pdpt[i].page_frame_number = i; 65 | } 66 | } 67 | 68 | static void create_table_entries(uintptr_t addr, bool write, bool exec) 69 | { 70 | DEBUG_PRINT("Creating page tables for address %lX write %d exec %d", addr, write, exec); 71 | 72 | size_t pml4_idx = ADDRMASK_PML4_INDEX(addr); 73 | size_t pdpte_idx = ADDRMASK_PDPTE_INDEX(addr); 74 | size_t pde_idx = ADDRMASK_PDE_INDEX(addr); 75 | size_t pte_idx = ADDRMASK_PTE_INDEX(addr); 76 | DEBUG_PRINT("PML4[%d] PDPTE[%d] PDE[%d] PTE[%d]", pml4_idx, pdpte_idx, pde_idx, pte_idx); 77 | 78 | pml4e_64 *pml4e = &m_ctx->pml4[pml4_idx]; 79 | DEBUG_PRINT("--- PML4E PFN[ADDR] %lX[%lX]", (uintptr_t)pml4e / PAGE_SIZE, pml4e); 80 | 81 | if (!pml4e->present) { 82 | pml4e->write = true; 83 | pml4e->page_frame_number = pmem_alloc_page() / PAGE_SIZE; 84 | die_on(!pml4e->page_frame_number, "Could not allocate PML4E for addr %lX", addr); 85 | pml4e->present = true; 86 | } 87 | 88 | pdpte_64 *pdpt = (pdpte_64 *)((uintptr_t)pml4e->page_frame_number * PAGE_SIZE); 89 | pdpte_64 *pdpte = &pdpt[pdpte_idx]; 90 | DEBUG_PRINT("--- PDPTE PFN[ADDR] %lX[%lX]", (uintptr_t)pdpte / PAGE_SIZE, pdpte); 91 | 92 | if (!pdpte->present) { 93 | pdpte->write = true; 94 | pdpte->page_frame_number = pmem_alloc_page() / PAGE_SIZE; 95 | die_on(!pdpte->page_frame_number, "Could not allocate PDPTE for addr %lX", addr); 96 | pdpte->present = true; 97 | } 98 | 99 | pde_64 *pd = (pde_64 *)((uintptr_t)pdpte->page_frame_number * PAGE_SIZE); 100 | pde_64 *pde = &pd[pde_idx]; 101 | DEBUG_PRINT("--- PDE PFN[ADDR] %lX[%lX]", (uintptr_t)pde / PAGE_SIZE, pde); 102 | 103 | if (!pde->present) { 104 | pde->write = true; 105 | pde->page_frame_number = pmem_alloc_page() / PAGE_SIZE; 106 | die_on(!pde->page_frame_number, "Could not allocate PDE for addr %lX", addr); 107 | pde->present = true; 108 | } 109 | 110 | pte_64 *pt = (pte_64 *)((uintptr_t)pde->page_frame_number * PAGE_SIZE); 111 | pte_64 *pte = &pt[pte_idx]; 112 | DEBUG_PRINT("--- PTE PFN[ADDR] %lX[%lX]", (uintptr_t)pte / PAGE_SIZE, pte); 113 | 114 | die_on(pte->present, "PTE is already present for addr %lX", 
addr); 115 | pte->write = write; 116 | pte->execute_disable = !exec; 117 | pte->page_frame_number = pmem_alloc_page() / PAGE_SIZE; 118 | die_on(!pte->page_frame_number, "Could not allocate PTE for addr %lX", addr); 119 | pte->present = true; 120 | DEBUG_PRINT("--- Allocated page memory PFN[ADDR] %lX[%lX]", 121 | pte->page_frame_number, pte->page_frame_number * PAGE_SIZE); 122 | 123 | /* Invalidate the TLB for address. */ 124 | __invlpg(&addr); 125 | 126 | #ifdef DEBUG_MODULE 127 | /* Some debug code that ensures that the created PA matches if we 128 | * traverse the VA back to PA. */ 129 | cr3 tmp_cr3; 130 | tmp_cr3.flags = __readcr3(); 131 | uintptr_t actual_pa = pte->page_frame_number * PAGE_SIZE; 132 | uintptr_t calc_pa = mem_va_to_pa(tmp_cr3, (void *)addr); 133 | die_on(actual_pa != calc_pa, 134 | "Physical addr %lX does not match calculated physical addr %lX", 135 | actual_pa, calc_pa); 136 | #endif 137 | } 138 | 139 | static void modify_entry_perms(uintptr_t addr, bool write, bool exec) 140 | { 141 | DEBUG_PRINT("Modifying page table entries for address %lX write %d exec %d", addr, write, exec); 142 | 143 | size_t pml4_idx = ADDRMASK_PML4_INDEX(addr); 144 | size_t pdpte_idx = ADDRMASK_PDPTE_INDEX(addr); 145 | size_t pde_idx = ADDRMASK_PDE_INDEX(addr); 146 | size_t pte_idx = ADDRMASK_PTE_INDEX(addr); 147 | DEBUG_PRINT("PML4[%d] PDPTE[%d] PDE[%d] PTE[%d]", pml4_idx, pdpte_idx, pde_idx, pte_idx); 148 | 149 | pml4e_64 *pml4e = &m_ctx->pml4[pml4_idx]; 150 | die_on(!pml4e->present, "PML4E for addr %lX not present", addr); 151 | 152 | pdpte_64 *pdpt = (pdpte_64 *)((uintptr_t)pml4e->page_frame_number * PAGE_SIZE); 153 | pdpte_64 *pdpte = &pdpt[pdpte_idx]; 154 | die_on(!pdpte->present, "PDPTE for addr %lX not present", addr); 155 | 156 | pde_64 *pd = (pde_64 *)((uintptr_t)pdpte->page_frame_number * PAGE_SIZE); 157 | pde_64 *pde = &pd[pde_idx]; 158 | 159 | die_on(!pde->present, "PDE for addr %lX not present", addr); 160 | 161 | pte_64 *pt = (pte_64 *)((uintptr_t)pde->page_frame_number * PAGE_SIZE); 162 | pte_64 *pte = &pt[pte_idx]; 163 | die_on(!pte->present, "PTE for addr %lX not present", addr); 164 | pte->write = write; 165 | pte->execute_disable = !exec; 166 | 167 | /* Invalidate the TLB for address. */ 168 | __invlpg(&addr); 169 | } 170 | 171 | void vmem_init(cr3 *original_cr3, cr3 *new_cr3) 172 | { 173 | /* Store the original CR3 value before initialising the virtual-memory manager. */ 174 | original_cr3->flags = __readcr3(); 175 | DEBUG_PRINT("Storing original CR3 %lX", original_cr3->flags); 176 | 177 | /* 178 | * Allocated a page for the vmem context. 179 | * Unfortunately as we are the virtual memory manager we cannot 180 | * allocate virtual memory to do this (for obvious reasons) 181 | * so we will allocate via pmem (it will be identity mapped either way) 182 | */ 183 | m_ctx = (struct vmem_ctx *)pmem_alloc_contiguous(sizeof(struct vmem_ctx)); 184 | die_on(!m_ctx, "Unable to allocate contact for virtual memory manager."); 185 | 186 | /* Clear main root PML4. */ 187 | memset(m_ctx->pml4, 0, sizeof(m_ctx->pml4)); 188 | 189 | /* Initialise the page table with identity mapping for the environment. */ 190 | init_identity_table(m_ctx); 191 | 192 | /* Set the next free address in the dynamic allocator. */ 193 | m_ctx->next_free_addr = DYN_VMEM_START; 194 | 195 | spin_init(&m_ctx->sync); 196 | 197 | /* Write the new CR3 value so that the memory manager is used. 
*/ 198 | new_cr3->page_level_cache_disable = original_cr3->page_level_cache_disable; 199 | new_cr3->page_level_write_through = original_cr3->page_level_write_through; 200 | new_cr3->address_of_page_directory = ((uintptr_t)m_ctx->pml4) / PAGE_SIZE; 201 | __writecr3(new_cr3->flags); 202 | DEBUG_PRINT("New CR3 value loaded %lX", new_cr3->flags); 203 | 204 | /* Ensure everything in EFER is correct. */ 205 | ia32_efer_register efer = { 0 }; 206 | efer.flags = rdmsr(IA32_EFER); 207 | die_on(!efer.ia32e_mode_active, "ia32e_mode not active"); 208 | die_on(!efer.ia32e_mode_enable, "ia32e_mode not enabled"); 209 | if (!efer.execute_disable_bit_enable) { 210 | DEBUG_PRINT("EFER.NX not enabled, setting now."); 211 | efer.execute_disable_bit_enable = true; 212 | wrmsr(IA32_EFER, efer.flags); 213 | } 214 | } 215 | 216 | void *vmem_alloc(size_t size, unsigned int flags) 217 | { 218 | /* 219 | * For each of the pages for the address specified from our next 220 | * free start address create a table entry. 221 | */ 222 | spin_lock(&m_ctx->sync); 223 | 224 | DEBUG_PRINT("Unaligned size: %ld", size); 225 | /* Align size to next largest page. */ 226 | size = (size & PAGE_MASK) ? ((size + PAGE_SIZE) & ~PAGE_MASK) : size; 227 | DEBUG_PRINT("Attempting allocation of size: %ld", size); 228 | 229 | /* Determine info from flags. */ 230 | bool write = (flags & MEM_WRITE) != 0; 231 | bool exec = (flags & MEM_EXECUTE) != 0; 232 | 233 | uintptr_t start_addr = m_ctx->next_free_addr; 234 | uintptr_t end_addr = start_addr + size; 235 | DEBUG_PRINT("Start addr 0x%lX end addr 0x%lX diff 0x%lX", start_addr, end_addr, end_addr - start_addr); 236 | 237 | for (uintptr_t curr_addr = start_addr; 238 | curr_addr < end_addr; 239 | curr_addr += PAGE_SIZE) { 240 | 241 | /* Allocate the pages tables for the address needed. */ 242 | create_table_entries(curr_addr, write, exec); 243 | } 244 | 245 | m_ctx->next_free_addr = end_addr; 246 | die_on(m_ctx->next_free_addr < DYN_VMEM_START, 247 | "The virtual memory manager's next_free_addr has iterated back into the" \ 248 | "identity mapped area, we should probably create an algorithm to reuse" \ 249 | "freed memory ranges."); 250 | 251 | spin_unlock(&m_ctx->sync); 252 | 253 | return (void *)start_addr; 254 | } 255 | 256 | void vmem_change_perms(void *addr, size_t size, unsigned int flags) 257 | { 258 | spin_lock(&m_ctx->sync); 259 | 260 | /* Determine info from flags. */ 261 | bool write = (flags & MEM_WRITE) != 0; 262 | bool exec = (flags & MEM_EXECUTE) != 0; 263 | 264 | uintptr_t start_addr = (uintptr_t)addr; 265 | uintptr_t end_addr = start_addr + size; 266 | 267 | /* Iterate each page and change permissions. */ 268 | for (uintptr_t curr_addr = start_addr; 269 | curr_addr < end_addr; 270 | curr_addr += PAGE_SIZE) { 271 | 272 | modify_entry_perms(curr_addr, write, exec); 273 | } 274 | 275 | spin_unlock(&m_ctx->sync); 276 | } 277 | 278 | void vmem_free(void *addr, size_t size) 279 | { 280 | /* TODO: Unfortunately need to pass size here, unless 281 | * we keep a VAD style map logging all of our allocations 282 | * (I'd rather kill myself than add more complexity to this 283 | * considering this is not the main goal of the project). 
*/ 284 | die_on(true, "vmem_free not implemented as of yet."); 285 | (void)addr; 286 | (void)size; 287 | } -------------------------------------------------------------------------------- /hypervisor/vmm/nested.c: -------------------------------------------------------------------------------- 1 | #define DEBUG_MODULE 2 | #include "platform/standard.h" 3 | #include "memory/mem.h" 4 | #include "memory/vmem.h" 5 | #include "vmm.h" 6 | #include "handler.h" 7 | #include "vmm_common.h" 8 | #include "nested.h" 9 | 10 | #define NESTED_REVISION_ID 0x0000BEEF 11 | 12 | struct nested_ctx { 13 | uint32_t vm_instruction_error; 14 | gpa_t vmxon_ptr; 15 | }; 16 | 17 | static void set_vmx_success_flags() 18 | { 19 | rfl rflags; 20 | rflags.flags = __vmread(VMCS_GUEST_RFLAGS); 21 | 22 | rflags.carry_flag = 0; 23 | rflags.parity_flag = 0; 24 | rflags.auxiliary_carry_flag = 0; 25 | rflags.zero_flag = 0; 26 | rflags.sign_flag = 0; 27 | rflags.overflow_flag = 0; 28 | __vmwrite(VMCS_GUEST_RFLAGS, rflags.flags); 29 | } 30 | 31 | static void set_vmx_fail_invalid_flags() 32 | { 33 | rfl rflags; 34 | rflags.flags = __vmread(VMCS_GUEST_RFLAGS); 35 | 36 | rflags.carry_flag = 1; 37 | rflags.parity_flag = 0; 38 | rflags.auxiliary_carry_flag = 0; 39 | rflags.zero_flag = 0; 40 | rflags.sign_flag = 0; 41 | rflags.overflow_flag = 0; 42 | __vmwrite(VMCS_GUEST_RFLAGS, rflags.flags); 43 | } 44 | 45 | // static void set_vmx_fail_valid_flags(struct nested_ctx *nested, uint32_t err_no) 46 | // { 47 | // rfl rflags; 48 | // rflags.flags = __vmread(VMCS_GUEST_RFLAGS); 49 | 50 | // rflags.carry_flag = 0; 51 | // rflags.parity_flag = 0; 52 | // rflags.auxiliary_carry_flag = 0; 53 | // rflags.zero_flag = 1; 54 | // rflags.sign_flag = 0; 55 | // rflags.overflow_flag = 0; 56 | // __vmwrite(VMCS_GUEST_RFLAGS, rflags.flags); 57 | // nested->vm_instruction_error = err_no; 58 | // } 59 | 60 | static bool is_nested_enabled(struct vcpu_ctx *vcpu) 61 | { 62 | return (vcpu->nested != NULL); 63 | } 64 | 65 | static void enable_nested_vmx(struct vcpu_ctx *vcpu) 66 | { 67 | assert(!is_nested_enabled(vcpu)); 68 | 69 | vcpu->nested = vmem_alloc(sizeof(struct nested_ctx), MEM_WRITE); 70 | die_on(!vcpu->nested, "Unable to allocate memory for vCPU %d nested virt.", vcpu->idx); 71 | 72 | DEBUG_PRINT("Nested virtualization enabled."); 73 | } 74 | 75 | static bool is_attempt_enable_vmx(struct vcpu_ctx *vcpu, 76 | vmx_exit_qualification_cr_access qual) 77 | { 78 | if (qual.cr_number != VMX_EXIT_QUALIFICATION_REGISTER_CR4) 79 | return false; 80 | 81 | if (qual.access_type != VMX_EXIT_QUALIFICATION_ACCESS_MOV_TO_CR) 82 | return false; 83 | 84 | /* For RSP register access we have to directly read guest RSP. */ 85 | uint64_t reg_val = vmm_read_gp_register(vcpu, qual.gp_register); 86 | DEBUG_PRINT("Attempt to set CR4 to val 0x%lX", reg_val); 87 | 88 | /* Check to see if guest is trying to set VMXE. */ 89 | if (!(reg_val & CR4_VMXE_MASK)) 90 | return false; 91 | 92 | /* If nested is already enabled do nothing. */ 93 | if (is_nested_enabled(vcpu)) 94 | return false; 95 | 96 | enable_nested_vmx(vcpu); 97 | return true; 98 | } 99 | 100 | static bool get_vmx_mem_address(struct vcpu_ctx *vcpu, 101 | uintptr_t qual, 102 | vmx_vmexit_instruction_info_vmx_and_xsaves info, 103 | gpa_t *mem_address) 104 | { 105 | gpa_t offset = qual; 106 | DEBUG_PRINT("Qual set to offset value 0x%lX", offset); 107 | 108 | if (info.base_register_invalid == false) { 109 | /* Read the base register and add to offset. 
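 * (e.g. for a guest instruction such as "vmxon [rax + rbx*4 + 0x10]" the
 * displacement 0x10 arrives via the exit qualification, base_register and
 * gp_register select RAX and RBX out of the guest context, and the index
 * contributes rbx << 2 via the scaling field; any FS/GS base is added
 * further below.)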
*/ 110 | uint64_t base_reg = vmm_read_gp_register(vcpu, info.base_register); 111 | offset += base_reg; 112 | DEBUG_PRINT("Adding base register %d value 0x%lX new offset 0x%lX", 113 | info.base_register, base_reg, offset); 114 | } 115 | 116 | if (info.gp_register_invalid == false) { 117 | /* Read the index register, and add to offset with scale. */ 118 | uint64_t index_reg = vmm_read_gp_register(vcpu, info.gp_register); 119 | offset += (index_reg << info.scaling); 120 | DEBUG_PRINT("Adding index register 0x%lX with scaling 0x%X new offset 0x%lX", 121 | index_reg, info.scaling, offset); 122 | } 123 | 124 | /* Deal with the address sizing. */ 125 | if (info.address_size == 1 /* 32 bit */) 126 | offset &= 0xFFFFFFFF; 127 | else if (info.address_size == 0 /* 16 bit */) 128 | offset &= 0xFFFF; 129 | 130 | /* 131 | * Check to see if guest in long mode, 132 | * if not we throw a fault as we do not support anything else. 133 | */ 134 | ia32_efer_register efer; 135 | efer.flags = __vmread(VMCS_GUEST_EFER); 136 | die_on(!efer.ia32e_mode_enable, "Nested virt not supported when not in long mode"); 137 | 138 | /* If GS or FS set, add these to the offset. */ 139 | if (info.segment_register == SEG_FS) 140 | offset += __vmread(VMCS_GUEST_FS_BASE); 141 | else if (info.segment_register == SEG_GS) 142 | offset += __vmread(VMCS_GUEST_GS_BASE); 143 | 144 | /* Ensure address falls in range of canonical addresses. */ 145 | if (is_noncanonical_address(offset)) { 146 | DEBUG_PRINT("Non canonical address specified 0x%lX", offset); 147 | static const exception_error_code DEFAULT_EC = { 0 }; 148 | vmm_inject_guest_event(info.segment_register == SEG_SS ? 149 | stack_segment_fault : general_protection, 150 | DEFAULT_EC); 151 | return false; 152 | } else { 153 | *mem_address = offset; 154 | return true; 155 | } 156 | } 157 | 158 | static bool get_vmptr(struct vcpu_ctx *vcpu, gpa_t *vmptr) 159 | { 160 | /* Attempt to read the VMXON/VMCS region for the host. */ 161 | vmx_vmexit_instruction_info_vmx_and_xsaves info; 162 | info.flags = __vmread(VMCS_EXIT_INSTR_INFO); 163 | 164 | uintptr_t qual = __vmread(VMCS_EXIT_QUALIFICATION); 165 | DEBUG_PRINT("VMXON instruction info 0x%lX qual 0x%lX", info.flags, qual); 166 | 167 | /* Get the address of the VMXON pointer in guest virtual memory. 
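 * (i.e. the memory operand of the guest's VMXON names a guest-virtual location
 * that holds an 8-byte guest-physical address; we resolve the operand first and
 * then read that value out of guest memory through the guest CR3 below.)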
*/ 168 | uintptr_t guest_addr; 169 | if (!get_vmx_mem_address(vcpu, qual, info, &guest_addr)) 170 | return false; 171 | 172 | cr3 guest_cr3; 173 | guest_cr3.flags = __vmread(VMCS_GUEST_CR3); 174 | if (!mem_copy_virt_tofrom_host(COPY_READ, guest_cr3, guest_addr, vmptr, sizeof(*vmptr))) { 175 | DEBUG_PRINT("Unable to read guest memory for VMXON pointer"); 176 | return false; 177 | } 178 | 179 | DEBUG_PRINT("VMX guest mem address 0x%lX vmptr 0x%lX", guest_addr, *vmptr); 180 | return true; 181 | } 182 | 183 | static bool nested_rdmsr(struct vcpu_ctx *vcpu, size_t msr, size_t *value) 184 | { 185 | (void)vcpu; 186 | 187 | switch (msr) { 188 | case IA32_VMX_BASIC: 189 | ia32_vmx_basic_register basic = { 0 }; 190 | basic.vmcs_revision_id = NESTED_REVISION_ID; 191 | basic.vmcs_size_in_bytes = PAGE_SIZE; 192 | basic.memory_type = MEMORY_TYPE_WB; 193 | basic.ins_outs_vmexit_information = true; 194 | basic.true_controls = true; 195 | *value = basic.flags; 196 | break; 197 | default: 198 | return false; 199 | } 200 | 201 | return true; 202 | } 203 | 204 | static bool nested_wrmsr(struct vcpu_ctx *vcpu, size_t msr, size_t value) 205 | { 206 | (void)vcpu; 207 | (void)msr; 208 | (void)value; 209 | return false; 210 | } 211 | 212 | static bool nested_mov_crx(struct vcpu_ctx *vcpu, bool *move_to_next) 213 | { 214 | (void)vcpu; 215 | 216 | /* Verify to see whether the MOV CRX is relevant to nested. */ 217 | vmx_exit_qualification_cr_access qual; 218 | qual.flags = __vmread(VMCS_EXIT_QUALIFICATION); 219 | DEBUG_PRINT("Exit qualification 0x%lX", qual.flags); 220 | 221 | if (is_attempt_enable_vmx(vcpu, qual)) { 222 | *move_to_next = true; 223 | return true; 224 | } 225 | 226 | *move_to_next = false; 227 | return false; 228 | } 229 | 230 | static bool nested_vmxon(struct vcpu_ctx *vcpu, bool *move_to_next) 231 | { 232 | static const exception_error_code DEFAULT_EC = { 0 }; 233 | 234 | /* Verify that VMXE is enabled for the guest. */ 235 | if (!(__vmread(VMCS_GUEST_CR4) & CR4_VMXE_MASK)) { 236 | vmm_inject_guest_event(invalid_opcode, DEFAULT_EC); 237 | *move_to_next = false; 238 | return true; 239 | } 240 | 241 | /* Verify that we are CPL 0. */ 242 | segment_selector cs; 243 | cs.flags = __vmread(VMCS_GUEST_CS_SEL); 244 | if (cs.request_privilege_level != 0) { 245 | vmm_inject_guest_event(general_protection, DEFAULT_EC); 246 | *move_to_next = false; 247 | return true; 248 | } 249 | 250 | /* Attempt to get the VMXON pointer. */ 251 | gpa_t vmptr; 252 | if (!get_vmptr(vcpu, &vmptr)) { 253 | *move_to_next = false; 254 | return true; 255 | } 256 | 257 | if (!vmptr) { 258 | DEBUG_PRINT("Unlikely null vmptr region, indicating failure."); 259 | set_vmx_fail_invalid_flags(); 260 | *move_to_next = true; 261 | return true; 262 | } 263 | 264 | /* Ensure that the guest VMXON pointer is page aligned. */ 265 | if (vmptr & PAGE_MASK) { 266 | DEBUG_PRINT("Non page aligned vmxon region defined 0x%lX", vmptr); 267 | set_vmx_fail_invalid_flags(); 268 | *move_to_next = true; 269 | return true; 270 | } 271 | 272 | /* Verify VMXON VMCS revision ID matches. */ 273 | uint32_t revision = *(uint32_t *)vmptr; 274 | 275 | /* 276 | * vmptr is a physical address of the guest, as we're identity mapped this is 277 | * mapped 1:1 into our virtual address space. No need to do conversion from 278 | * virtual guest to physical host. 
279 | */ 280 | if (revision != NESTED_REVISION_ID) { 281 | DEBUG_PRINT("Guest specified revision 0x%X does not match host supported 0x%X", 282 | revision, NESTED_REVISION_ID); 283 | set_vmx_fail_invalid_flags(); 284 | *move_to_next = true; 285 | return true; 286 | } else { 287 | DEBUG_PRINT("Guest vmcs revision 0x%X", revision); 288 | } 289 | 290 | /* TODO: Allocate the vmxon fields for nested support. */ 291 | vcpu->nested->vmxon_ptr = vmptr; 292 | set_vmx_success_flags(); 293 | 294 | *move_to_next = true; 295 | return true; 296 | } 297 | 298 | static void handle_rdmsr(struct vcpu_ctx *vcpu, void *opaque, bool *move_to_next) 299 | { 300 | static const exception_error_code DEFAULT_EC = { 0 }; 301 | 302 | (void)opaque; 303 | 304 | size_t msr = (uint32_t)vcpu->guest_context.rcx; 305 | 306 | /* Check to see if valid RPL to perform the read. */ 307 | segment_selector cs; 308 | cs.flags = __vmread(VMCS_GUEST_CS_SEL); 309 | if (cs.request_privilege_level != 0) { 310 | DEBUG_PRINT("RDMSR 0x%lX wrong RPL 0x%X", msr, cs.request_privilege_level); 311 | vmm_inject_guest_event(general_protection, DEFAULT_EC); 312 | *move_to_next = false; 313 | return; 314 | } 315 | 316 | /* Check to see if within valid MSR range. */ 317 | if ((msr && (msr <= 0x1FFF)) || ((msr >= 0xC0000000) && (msr <= 0xC0001FFF))) { 318 | 319 | /* Attempt to offload to nested handler. */ 320 | size_t value; 321 | if (!nested_rdmsr(vcpu, msr, &value)) 322 | die_on(true, "Unhandled MSR read 0x%lX at rip 0x%lX", msr, vcpu->guest_context.rip); 323 | 324 | vcpu->guest_context.rax = (uint32_t)value; 325 | vcpu->guest_context.rdx = value >> 32; 326 | DEBUG_PRINT("Guest read MSR 0x%lX spoofed value 0x%lX real 0x%lX", 327 | msr, value, rdmsr(msr)); 328 | 329 | *move_to_next = true; 330 | return; 331 | } 332 | 333 | /* Invalid MSR which is out of range. */ 334 | DEBUG_PRINT("RDMSR 0x%lX out of range", msr); 335 | vmm_inject_guest_event(general_protection, DEFAULT_EC); 336 | *move_to_next = false; 337 | } 338 | 339 | static void handle_wrmsr(struct vcpu_ctx *vcpu, void *opaque, bool *move_to_next) 340 | { 341 | static const exception_error_code DEFAULT_EC = { 0 }; 342 | 343 | (void)opaque; 344 | 345 | size_t msr_id = (uint32_t)vcpu->guest_context.rcx; 346 | size_t msr_val = (vcpu->guest_context.rdx << 32) | (uint32_t)vcpu->guest_context.rax; 347 | 348 | /* Check to see if valid RPL to perform the read. */ 349 | segment_selector cs; 350 | cs.flags = __vmread(VMCS_GUEST_CS_SEL); 351 | if (cs.request_privilege_level != 0) { 352 | DEBUG_PRINT("WRMSR 0x%lX 0x%lX wrong RPL 0x%X", msr_id, msr_val, cs.request_privilege_level); 353 | vmm_inject_guest_event(general_protection, DEFAULT_EC); 354 | *move_to_next = false; 355 | return; 356 | } 357 | 358 | /* Check to see if within valid MSR range. */ 359 | if ((msr_id && (msr_id <= 0x1FFF)) || ((msr_id >= 0xC0000000) && (msr_id <= 0xC0001FFF))) { 360 | 361 | /* Attempt to offload to nested handler. */ 362 | if (!nested_wrmsr(vcpu, msr_id, msr_val)) 363 | die_on(true, "Unhandled MSR write 0x%lX value at rip 0x%lX", 364 | msr_id, msr_val, vcpu->guest_context.rip); 365 | 366 | DEBUG_PRINT("Guest write MSR 0x%lX value 0x%lX", msr_id, msr_val); 367 | 368 | *move_to_next = true; 369 | return; 370 | } 371 | 372 | /* Invalid MSR which is out of range. 
*/ 373 | DEBUG_PRINT("WRMSR 0x%lX 0x%lX out of range", msr_id, msr_val); 374 | vmm_inject_guest_event(general_protection, DEFAULT_EC); 375 | *move_to_next = false; 376 | return; 377 | } 378 | 379 | static void handle_mov_crx(struct vcpu_ctx *vcpu, void *opaque, bool *move_to_next) 380 | { 381 | (void)opaque; 382 | die_on(!nested_mov_crx(vcpu, move_to_next), "Unable to handle MOV_CRX"); 383 | } 384 | 385 | static void handle_vmxon(struct vcpu_ctx *vcpu, void *opaque, bool *move_to_next) 386 | { 387 | (void)opaque; 388 | die_on(!nested_vmxon(vcpu, move_to_next), "Unable to handle VMXON"); 389 | } 390 | 391 | void nested_init(struct vmm_ctx *vmm) 392 | { 393 | handler_register_exit(vmm->handler, VMX_EXIT_REASON_RDMSR, handle_rdmsr, NULL, false); 394 | handler_register_exit(vmm->handler, VMX_EXIT_REASON_WRMSR, handle_wrmsr, NULL, false); 395 | handler_register_exit(vmm->handler, VMX_EXIT_REASON_MOV_CRX, handle_mov_crx, NULL, false); 396 | handler_register_exit(vmm->handler, VMX_EXIT_REASON_VMXON, handle_vmxon, NULL, false); 397 | } 398 | 399 | void nested_init_vcpu(struct vcpu_ctx *vcpu) 400 | { 401 | /* 402 | * Adjust the MSR bitmap to indicate which nested VMX related 403 | * MSRs we need to trap on. 404 | */ 405 | vmm_msr_trap_enable(vcpu->msr_trap_bitmap, IA32_VMX_BASIC, true); 406 | } -------------------------------------------------------------------------------- /hypervisor/vmm/ept.c: -------------------------------------------------------------------------------- 1 | #define DEBUG_MODULE 2 | #include 3 | #include "platform/standard.h" 4 | #include "platform/intrin.h" 5 | #include "memory/vmem.h" 6 | #include "memory/mem.h" 7 | #include "ept.h" 8 | #include "ia32_compact.h" 9 | 10 | #define ENTRIES_PER_TABLE 512 11 | 12 | struct mtrr_data { 13 | bool valid; 14 | bool is_fixed; 15 | uint8_t type; 16 | size_t phys_base_min; 17 | size_t phys_base_max; 18 | }; 19 | 20 | struct ept_ctx { 21 | /* Describes 512 contiguous 512GiB memory regions. */ 22 | __attribute__((aligned(PAGE_SIZE))) epml4e pml4[ENTRIES_PER_TABLE]; 23 | /* Describes exactly 512 contiguous 1GiB memory regions with a singular PML4 region. */ 24 | __attribute__((aligned(PAGE_SIZE))) ept_pdpte pml3[ENTRIES_PER_TABLE]; 25 | /* For each 1GB PML3 entry, create 512 2MB regions. 26 | * We are using 2MB pages as the smallest paging size in the map so that we do not need 27 | * to allocate individual 4096 PML1 paging structures. */ 28 | __attribute__((aligned(PAGE_SIZE))) ept_pde_2mb pml2[ENTRIES_PER_TABLE][ENTRIES_PER_TABLE]; 29 | 30 | /* The EPT pointer for this context. */ 31 | eptp ept_ptr; 32 | 33 | /* List of MTRR data, not all may be valid as each processor arch 34 | * can vary depending on the amount of MTRRs implemented. */ 35 | struct mtrr_data mtrr[IA32_MTRR_COUNT]; 36 | size_t mtrr_count; 37 | uint8_t def_mem_type; 38 | }; 39 | 40 | struct ept_split_page { 41 | /* The PML1E/EPT_PTE table. */ 42 | __attribute__((aligned(PAGE_SIZE))) ept_pte pml1[ENTRIES_PER_TABLE]; 43 | 44 | /* A back reference to the PML2 entry which this was created for. 
*/ 45 | ept_pde_2mb *pml2e_ref; 46 | }; 47 | 48 | static void gather_fixed_mtrr(struct ept_ctx *ctx) 49 | { 50 | struct fixed_mtrr_info { 51 | uint64_t msr_id; 52 | uintptr_t base_address; 53 | size_t managed_size; 54 | }; 55 | 56 | typedef union { 57 | struct { 58 | uint8_t types[8]; 59 | } u; 60 | uint64_t flags; 61 | } ia32_mtrr_fixed_range_msr; 62 | 63 | static const struct fixed_mtrr_info FIXED_INFO[] = { 64 | { IA32_MTRR_FIX64K_00000, 0x0, 0x10000, }, 65 | { IA32_MTRR_FIX16K_80000, 0x80000, 0x4000, }, 66 | { IA32_MTRR_FIX16K_A0000, 0xA0000, 0x4000, }, 67 | { IA32_MTRR_FIX4K_C0000, 0xC0000, 0x1000, }, 68 | { IA32_MTRR_FIX4K_C8000, 0xC8000, 0x1000, }, 69 | { IA32_MTRR_FIX4K_D0000, 0xD0000, 0x1000, }, 70 | { IA32_MTRR_FIX4K_D8000, 0xD8000, 0x1000, }, 71 | { IA32_MTRR_FIX4K_E0000, 0xE0000, 0x1000, }, 72 | { IA32_MTRR_FIX4K_E8000, 0xE8000, 0x1000, }, 73 | { IA32_MTRR_FIX4K_F0000, 0xF0000, 0x1000, }, 74 | { IA32_MTRR_FIX4K_F8000, 0xF8000, 0x1000, }, 75 | }; 76 | 77 | ia32_mtrrcap_register caps = { .flags = rdmsr(IA32_MTRRCAP) }; 78 | ia32_mtrr_def_type_register def_type = { .flags = rdmsr(IA32_MTRR_DEF_TYPE) }; 79 | 80 | /* Store the default memory type, for all regions not covered by a MTRR. */ 81 | ctx->def_mem_type = def_type.default_memory_type; 82 | 83 | if (!caps.fixed_range_registers_supported || !def_type.fixed_range_mtrr_enable) 84 | return; 85 | 86 | struct mtrr_data *last_valid = NULL; 87 | 88 | for (size_t i = 0; i < ARRAY_SIZE(FIXED_INFO); i++) { 89 | const struct fixed_mtrr_info *curr_range = &FIXED_INFO[i]; 90 | 91 | ia32_mtrr_fixed_range_msr fixed_range = {.flags = rdmsr(curr_range->msr_id) }; 92 | 93 | for (size_t j = 0; j < ARRAY_SIZE(fixed_range.u.types); j++) { 94 | uint8_t mem_type = fixed_range.u.types[j]; 95 | uint64_t range_begin = curr_range->base_address + (curr_range->managed_size * j); 96 | uint64_t range_end = range_begin + curr_range->managed_size; 97 | 98 | /* 99 | * Check to see if we can combine it with previous. 100 | * For this to be true, it must be of the same memory type 101 | * and also be contiguous. 102 | * 103 | * This will make it easier & quicker for when we do searching. 
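 * (e.g. two adjacent fixed sub-ranges 0xC0000-0xC1000 and 0xC1000-0xC2000 that
 * both report write-back are collapsed into a single 0xC0000-0xC2000 entry
 * instead of consuming two mtrr_data slots.)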
104 | */ 105 | if (last_valid && (last_valid->type == mem_type) && 106 | (last_valid->phys_base_max == range_begin)) { 107 | 108 | last_valid->phys_base_max += curr_range->managed_size; 109 | // DEBUG_PRINT("Extended last fixed entry to phys_based_min=0x%lX phys_base_max=0x%lX", 110 | // last_valid->phys_base_min, last_valid->phys_base_max); 111 | } else { 112 | struct mtrr_data *new_entry = &ctx->mtrr[ctx->mtrr_count]; 113 | new_entry->valid = true; 114 | new_entry->is_fixed = true; 115 | new_entry->phys_base_min = range_begin; 116 | new_entry->phys_base_max = range_end; 117 | new_entry->type = mem_type; 118 | 119 | // DEBUG_PRINT("Adding fixed entry phys_base_min=0x%lX phys_base_max=0x%lX type=0x%lX", 120 | // new_entry->phys_base_min, new_entry->phys_base_max, new_entry->type); 121 | 122 | last_valid = new_entry; 123 | ctx->mtrr_count++; 124 | } 125 | } 126 | } 127 | } 128 | 129 | static void gather_variable_mtrr(struct ept_ctx *ctx) 130 | { 131 | ia32_mtrrcap_register caps = { .flags = rdmsr(IA32_MTRRCAP) }; 132 | ia32_mtrr_physbase_register base; 133 | ia32_mtrr_physmask_register mask; 134 | 135 | 136 | struct mtrr_data *last_valid = NULL; 137 | 138 | for (size_t i = 0; i < caps.variable_range_registers_count; i++) { 139 | base.flags = rdmsr(IA32_MTRR_PHYSBASE0 + (i * 2)); 140 | mask.flags = rdmsr(IA32_MTRR_PHYSMASK0 + (i * 2)); 141 | 142 | /* If the mtrr is valid, calculate the min and maximum ranges. */ 143 | if (mask.valid) { 144 | 145 | /* 146 | * __builtin_ffsll returns 1 + the index of the least significate 1-bit of x. 147 | * https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html 148 | */ 149 | long long bit_idx = __builtin_ffsll(mask.physical_addres_mask) - 1; 150 | size_t size_in_pages = (1ull << bit_idx); 151 | 152 | uint8_t mem_type = base.type; 153 | uint64_t range_begin = base.physical_addres_base * PAGE_SIZE; 154 | uint64_t range_size = size_in_pages * PAGE_SIZE; 155 | uint64_t range_end = range_begin + range_size; 156 | 157 | if (last_valid && (last_valid->type == mem_type) && 158 | (last_valid->phys_base_max == range_begin)) { 159 | 160 | last_valid->phys_base_max += range_size; 161 | // DEBUG_PRINT("Extended last variable entry to phys_base_min=0x%lX phys_base_max=0x%lX", 162 | // last_valid->phys_base_min, last_valid->phys_base_max); 163 | } else { 164 | struct mtrr_data *new_entry = &ctx->mtrr[ctx->mtrr_count]; 165 | new_entry->valid = true; 166 | new_entry->is_fixed = false; 167 | new_entry->phys_base_min = range_begin; 168 | new_entry->phys_base_max = range_end; 169 | new_entry->type = mem_type; 170 | 171 | // DEBUG_PRINT("Adding variable phys_base_min=0x%lX phys_base_max=0x%lX type=0x%lX", 172 | // new_entry->phys_base_min, new_entry->phys_base_max, new_entry->type); 173 | 174 | last_valid = new_entry; 175 | ctx->mtrr_count++; 176 | } 177 | } 178 | } 179 | } 180 | 181 | static void gather_mtrr_list(struct ept_ctx *ctx) 182 | { 183 | gather_fixed_mtrr(ctx); 184 | gather_variable_mtrr(ctx); 185 | 186 | /* Let's print our memory type information. 
*/ 187 | DEBUG_PRINT("Default memory type=%d", ctx->def_mem_type); 188 | for (size_t i = 0; i < IA32_MTRR_COUNT; i++) { 189 | const struct mtrr_data *curr_mtrr = &ctx->mtrr[i]; 190 | 191 | if (curr_mtrr->valid) 192 | DEBUG_PRINT("Range begin=0x%016llX end=0x%016llX type=%d fixed=%d", 193 | curr_mtrr->phys_base_min, curr_mtrr->phys_base_max, 194 | curr_mtrr->type, curr_mtrr->is_fixed); 195 | } 196 | } 197 | 198 | static uint32_t calc_mem_type(struct ept_ctx *ctx, uintptr_t phys_begin, size_t phys_size) 199 | { 200 | /* 201 | * Iterate all of the MTRRs we have defined, and check to see if they match any of 202 | * the range which we have specified, if so we then use that MTRR's value. 203 | */ 204 | uint32_t mem_type = MEMORY_TYPE_INVALID; 205 | 206 | for (size_t i = 0; i < IA32_MTRR_COUNT; i++) { 207 | const struct mtrr_data *curr_mtrr = &ctx->mtrr[i]; 208 | 209 | /* Filter out invalid/empty entries. */ 210 | if (!curr_mtrr->valid) 211 | continue; 212 | 213 | /* If out of range, let's skip. */ 214 | if ((phys_begin < curr_mtrr->phys_base_min) || 215 | (phys_begin >= curr_mtrr->phys_base_max)) 216 | continue; 217 | 218 | /* If the range of our phys region is larger than defined in the MTRR throw an error. */ 219 | if ((phys_begin + phys_size - 1) >= curr_mtrr->phys_base_max) 220 | return mem_type; 221 | 222 | /* Fixed MTRRs take precedence over all others. */ 223 | if (curr_mtrr->is_fixed) 224 | return curr_mtrr->type; 225 | 226 | /* Uncacheable takes next precedence. */ 227 | if (curr_mtrr->type == MEMORY_TYPE_UC) 228 | return curr_mtrr->type; 229 | 230 | /* Writethrough always takes precedence over writeback memory. */ 231 | if (((curr_mtrr->type == MEMORY_TYPE_WT) && (mem_type == MEMORY_TYPE_WB)) || 232 | ((curr_mtrr->type == MEMORY_TYPE_WB) && (mem_type == MEMORY_TYPE_WT))) { 233 | mem_type = MEMORY_TYPE_WT; 234 | } 235 | 236 | /* Anything else, just set to last matched. */ 237 | mem_type = curr_mtrr->type; 238 | } 239 | 240 | /* If we didn't find the value in the MTRR list then just use the default. */ 241 | if (mem_type == MEMORY_TYPE_INVALID) 242 | mem_type = ctx->def_mem_type; 243 | 244 | return mem_type; 245 | } 246 | 247 | static void ept_split_large_page(struct ept_ctx *ctx, uintptr_t phys_addr) 248 | { 249 | /* Attempt to get the PML2E for the physical address specified. */ 250 | ept_pde_2mb *target_pml2e = get_ept_pml2e(ctx, phys_addr); 251 | die_on(!target_pml2e, "Invalid PML2E for addr 0x%lX", phys_addr); 252 | 253 | /* If the large page bit isn't set, this means already split. */ 254 | if (!target_pml2e->large_page) 255 | return; 256 | 257 | struct ept_split_page *new_split = vmem_alloc(sizeof(struct ept_split_page), MEM_WRITE); 258 | die_on(!new_split, "Unable to allocate memory for split page, phys_addr 0x%lX", phys_addr); 259 | 260 | /* Store the back reference to the PML2E */ 261 | new_split->pml2e_ref = target_pml2e; 262 | 263 | /* Now create a stub/template PML1E with default params. */ 264 | ept_pte temp_pte = { 265 | .read_access = true, 266 | .write_access = true, 267 | .execute_access = true, 268 | .ignore_pat = target_pml2e->ignore_pat, 269 | .suppress_ve = target_pml2e->suppress_ve 270 | }; 271 | 272 | die_on(temp_pte.memory_type == MEMORY_TYPE_INVALID, 273 | "Memory type for 0x%lX is invalid even with splitting.", phys_addr); 274 | 275 | /* Calculate the physical address of the original PML2 entry. 276 | * and the page frame number, we will use this as base. 
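 * For example, a 2MiB entry with page_frame_number 0x1D3 gives
 * base_pml2e = 0x1D3 * 2MiB = 0x3A600000 and base_pfn = 0x3A600, so
 * PML1E[i] below simply receives PFN 0x3A600 + i.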
*/ 277 | uintptr_t base_pml2e = target_pml2e->page_frame_number * MiB(2); 278 | uintptr_t base_pfn = base_pml2e / PAGE_SIZE; 279 | 280 | /* Now fill out all the new PML1E's for the table. 281 | * Use the template PTE for the general flags, 282 | * but we will also need to update the PFN. 283 | * As we have calculated the original PFN for the PML2E we can 284 | * we can just add one page for each entry. */ 285 | for (int i = 0; i < ENTRIES_PER_TABLE; i++) { 286 | size_t curr_pfn = base_pfn + i; 287 | uintptr_t curr_phys = curr_pfn * PAGE_SIZE; 288 | 289 | new_split->pml1[i] = temp_pte; 290 | new_split->pml1[i].memory_type = calc_mem_type(ctx, curr_phys, PAGE_SIZE); 291 | new_split->pml1[i].page_frame_number = curr_pfn; 292 | } 293 | 294 | /* Now create a new PML2E entry to replace the old one. */ 295 | cr3 this_cr3; 296 | this_cr3.flags = __readcr3(); 297 | 298 | uintptr_t phys_pml1e = mem_va_to_pa(this_cr3, &new_split->pml1[0]); 299 | 300 | ept_pde new_pde = { 301 | .read_access = true, 302 | .write_access = true, 303 | .execute_access = true, 304 | .page_frame_number = (phys_pml1e / PAGE_SIZE) 305 | }; 306 | target_pml2e->flags = new_pde.flags; 307 | } 308 | 309 | struct ept_ctx *ept_init(void) 310 | { 311 | /* Initialise SLAT/EPT for the guest. 312 | * As we are hyperjacking this will consist of a 313 | * 1:1 guest/host identity map to aid in conversion. */ 314 | struct ept_ctx *ctx = vmem_alloc(sizeof(struct ept_ctx), MEM_WRITE); 315 | die_on(!ctx, "Unable to allocate context for EPT."); 316 | 317 | /* Gather the MTRR layout list so that this can be used in the future. */ 318 | gather_mtrr_list(ctx); 319 | 320 | /* Configure the EPT pointer. */ 321 | cr3 this_cr3; 322 | this_cr3.flags = __readcr3(); 323 | 324 | uintptr_t phys_pml4 = mem_va_to_pa(this_cr3, ctx->pml4); 325 | ctx->ept_ptr.page_walk_length = 3; 326 | ctx->ept_ptr.memory_type = MEMORY_TYPE_WB; 327 | ctx->ept_ptr.page_frame_number = phys_pml4 / PAGE_SIZE; 328 | 329 | /* Fill out the first top level 512GiB entry. 330 | * We don't need to do the others as it's HIGHLY unlikely 331 | * that this will ever be ran on a 512GiB system. */ 332 | uintptr_t phys_pml3 = mem_va_to_pa(this_cr3, ctx->pml3); 333 | ctx->pml4[0].read_access = true; 334 | ctx->pml4[0].write_access = true; 335 | ctx->pml4[0].execute_access = true; 336 | ctx->pml4[0].page_frame_number = phys_pml3 / PAGE_SIZE; 337 | 338 | /* Configure the lower level PML3 table, 339 | * each entry indicates 1GiB of physical memory, 340 | * therefore the first 512GiB is identity mapped. */ 341 | for (int i = 0; i < ENTRIES_PER_TABLE; i++) { 342 | ctx->pml3[i].read_access = true; 343 | ctx->pml3[i].write_access = true; 344 | ctx->pml3[i].execute_access = true; 345 | 346 | uintptr_t phys_pml2 = mem_va_to_pa(this_cr3, &ctx->pml2[i][0]); 347 | ctx->pml3[i].page_frame_number = phys_pml2 / PAGE_SIZE; 348 | } 349 | 350 | /* Loop every 1 GiB of RAM (PML3). */ 351 | for (int i = 0; i < ENTRIES_PER_TABLE; i++) { 352 | /* Loop every 2 MiB within that GiB. */ 353 | for (int j = 0; j < ENTRIES_PER_TABLE; j++) { 354 | ctx->pml2[i][j].read_access = true; 355 | ctx->pml2[i][j].write_access = true; 356 | ctx->pml2[i][j].execute_access = true; 357 | ctx->pml2[i][j].large_page = true; 358 | ctx->pml2[i][j].page_frame_number = (i * ENTRIES_PER_TABLE) + j; 359 | 360 | uintptr_t phys_addr = ctx->pml2[i][j].page_frame_number * MiB(2); 361 | 362 | /* Calculate the memory type for this entry. 
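 * Because of the identity map, entry (i, j) always describes guest-physical
 * i*1GiB + j*2MiB; e.g. (i=1, j=3) covers 0x40600000-0x407FFFFF, and that is
 * the 2MiB window calc_mem_type() is asked about here.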
*/ 363 | uint32_t mem_type = calc_mem_type(ctx, phys_addr, MiB(2)); 364 | 365 | /* 366 | * If the memory type is invalid (too large but semi in a 2MiB page) 367 | * Then let's split the it into smaller PML1E entries and then re-calc 368 | * the memory type individually. 369 | */ 370 | if (mem_type != MEMORY_TYPE_INVALID) { 371 | ctx->pml2[i][j].memory_type = mem_type; 372 | } else { 373 | DEBUG_PRINT("Memory type for 0x%lX is invalid, splitting.", phys_addr); 374 | ept_split_large_page(ctx, phys_addr); 375 | } 376 | } 377 | } 378 | 379 | return ctx; 380 | } 381 | 382 | eptp *ept_get_pointer(struct ept_ctx *ctx) 383 | { 384 | return &ctx->ept_ptr; 385 | } 386 | 387 | ept_pde_2mb *get_ept_pml2e(struct ept_ctx *ctx, uintptr_t phys_addr) 388 | { 389 | uint64_t pml4_idx = ADDRMASK_EPT_PML4_INDEX(phys_addr); 390 | die_on(pml4_idx, "Cannot support PML4E[%d] above 0 (512GiB)", pml4_idx); 391 | 392 | uint64_t pml3_idx = ADDRMASK_EPT_PML3_INDEX(phys_addr); 393 | uint64_t pml2_idx = ADDRMASK_EPT_PML2_INDEX(phys_addr); 394 | return &ctx->pml2[pml3_idx][pml2_idx]; 395 | } 396 | 397 | ept_pte *ept_get_pml1e(struct ept_ctx *ctx, uintptr_t phys_addr) 398 | { 399 | /* First get the PML2E. 400 | * If the current page is a large page (2MiB) then we 401 | * should proceed with splitting this PML2E into standard 402 | * pages. From there we can then return that value. */ 403 | ept_pde_2mb *target_pml2e_2mb = get_ept_pml2e(ctx, phys_addr); 404 | die_on(!target_pml2e_2mb, "Invalid PML2E for addr 0x%lX", phys_addr); 405 | 406 | if (target_pml2e_2mb->large_page) { 407 | /* Split the page, and then ensure we invalidate and flush the 408 | * EPT cache. */ 409 | ept_split_large_page(ctx, phys_addr); 410 | ept_invalidate_and_flush(ctx); 411 | } 412 | 413 | /* Now re-cast the PML2E/PDE as it should be split, 414 | * then we can just return the correct value. */ 415 | ept_pde *target_pml2e = (ept_pde *)target_pml2e_2mb; 416 | 417 | ept_pte *pml1_table = (ept_pte *)((uintptr_t)target_pml2e->page_frame_number * PAGE_SIZE); 418 | return &pml1_table[ADDRMASK_EPT_PML1_INDEX(phys_addr)]; 419 | } 420 | 421 | void ept_invalidate_and_flush(struct ept_ctx *ctx) 422 | { 423 | invept_descriptor desc = { 424 | .ept_pointer = ctx->ept_ptr.flags, 425 | .reserved = 0 426 | }; 427 | __invept(invvpid_all_context, &desc); 428 | } -------------------------------------------------------------------------------- /hypervisor/vmm/handler.c: -------------------------------------------------------------------------------- 1 | #define DEBUG_MODULE 2 | #include "platform/intrin.h" 3 | #include "memory/mem.h" 4 | #include "memory/vmem.h" 5 | #include "interrupt/idt.h" 6 | #include "vmm_common.h" 7 | #include "handler.h" 8 | #include "ia32_compact.h" 9 | 10 | struct vmexit_handler { 11 | /* Doubly linked list so multiple exit handlers can be daisy chained. */ 12 | struct vmexit_handler *next, *prev; 13 | /* The callback for the exit to be called. */ 14 | vmexit_cbk_t callback; 15 | /* Callback specific data. */ 16 | void *opaque; 17 | /* If called, prevent other daisy chained callbacks from being called. */ 18 | bool override; 19 | }; 20 | 21 | struct handler_ctx { 22 | /* An array of exit handler structures. 23 | * Each VMEXIT can have multiple handlers daisy chained. 24 | * therefore we have to keep track of pointers. 25 | * 26 | * We can store up to the maximum exit reason which is XRSTORS 27 | * (well at least currently). 
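 * Note that with the table sized at exactly VMX_EXIT_REASON_XRSTORS, an
 * XRSTORS exit (reason == MAX_EXIT_HANDLERS) falls outside the table and
 * would trip the range check in handle_exit_reason(); sizing it as
 * VMX_EXIT_REASON_XRSTORS + 1 would cover that reason as well.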
*/ 28 | #define MAX_EXIT_HANDLERS VMX_EXIT_REASON_XRSTORS 29 | struct vmexit_handler *handlers[MAX_EXIT_HANDLERS]; 30 | /* Back reference to the VMM context */ 31 | struct vmm_ctx *vmm; 32 | }; 33 | 34 | static void handle_cached_interrupts(struct vcpu_ctx *vcpu) 35 | { 36 | /* 37 | * Check to see if there are any pending interrupts 38 | * to be delivered that were caught from the host IDT 39 | * that need to be redirected to the guest. 40 | * 41 | * We only do this if there is NOT already a pending 42 | * interrupt. 43 | */ 44 | if (vcpu->cached_int.pending) { 45 | DEBUG_PRINT("Forwarding vector 0x%lX error code 0x%lX", 46 | vcpu->cached_int.vector, vcpu->cached_int.code); 47 | vmm_inject_guest_event(vcpu->cached_int.vector, vcpu->cached_int.code); 48 | vcpu->cached_int.pending = false; 49 | } 50 | } 51 | 52 | static void handle_cpuid(struct vcpu_ctx *vcpu, void *opaque, bool *move_to_next) 53 | { 54 | /* Override leafs. */ 55 | #define HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS 0x40000000 56 | #define HYPERV_CPUID_INTERFACE 0x40000001 57 | 58 | /* Bitmasks in certain leafs. */ 59 | static const size_t CPUID_VI_BIT_HYPERVISOR_PRESENT = 0x80000000; 60 | 61 | (void)opaque; 62 | 63 | /* Read the CPUID into the leafs array. */ 64 | uint64_t leaf = vcpu->guest_context.rax; 65 | uint64_t sub_leaf = vcpu->guest_context.rcx; 66 | int out_regs[4] = { 0 }; 67 | __cpuidex(out_regs, leaf, sub_leaf); 68 | 69 | /* Override certain target leafs. */ 70 | #pragma GCC diagnostic push 71 | #pragma GCC diagnostic ignored "-Wmultichar" 72 | switch (leaf) { 73 | case CPUID_VERSION_INFO: 74 | out_regs[2] &= ~(uint64_t)CPUID_VI_BIT_HYPERVISOR_PRESENT; 75 | break; 76 | case HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS: 77 | out_regs[0] = HYPERV_CPUID_INTERFACE; 78 | out_regs[1] = 'csac'; 79 | out_regs[2] = '\0eda'; 80 | out_regs[3] = '\0\0\0\0'; 81 | break; 82 | case HYPERV_CPUID_INTERFACE: 83 | out_regs[0] = 'csac'; 84 | out_regs[1] = 0; 85 | out_regs[2] = 0; 86 | out_regs[3] = 0; 87 | break; 88 | } 89 | #pragma GCC diagnostic pop 90 | 91 | DEBUG_PRINT("CPUID leaf 0x%lX sub_leaf 0x%lX - 0x%lX 0x%lX 0x%lX 0x%lX", 92 | leaf, sub_leaf, out_regs[0], out_regs[1], out_regs[2], out_regs[3]); 93 | 94 | /* Store these leafs back into the guest context and move to next. */ 95 | vcpu->guest_context.rax = out_regs[0]; 96 | vcpu->guest_context.rbx = out_regs[1]; 97 | vcpu->guest_context.rcx = out_regs[2]; 98 | vcpu->guest_context.rdx = out_regs[3]; 99 | *move_to_next = true; 100 | } 101 | 102 | static void handle_xsetbv(struct vcpu_ctx *vcpu, void *opaque, bool *move_to_next) 103 | { 104 | static const exception_error_code DEFAULT_EC = { 0 }; 105 | 106 | (void)opaque; 107 | 108 | /* Check to ensure that os_xsave is enabled. */ 109 | cr4 guest_cr4; 110 | guest_cr4.flags = __vmread(VMCS_GUEST_CR4); 111 | if (!guest_cr4.os_xsave) { 112 | DEBUG_PRINT("XSETBV when CR4.os_xsave not set"); 113 | vmm_inject_guest_event(invalid_opcode, DEFAULT_EC); 114 | *move_to_next = false; 115 | return; 116 | } 117 | 118 | /* Check that a valid XCR index is set (only 0 supported). */ 119 | uint32_t field = (uint32_t)vcpu->guest_context.rcx; 120 | if (field) { 121 | DEBUG_PRINT("XSETBV invalid XCR field 0x%X", field); 122 | vmm_inject_guest_event(general_protection, DEFAULT_EC); 123 | *move_to_next = false; 124 | return; 125 | } 126 | 127 | /* 128 | * Running XSETBV requires os_xsave to be set in CR4 129 | * this is not the cast in an EFI booted environment 130 | * so we enable it before the call. 
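 * The guest's requested XCR0 is then reassembled from EDX:EAX below; e.g. a
 * guest enabling x87, SSE and AVX state passes RAX = 0x7, RDX = 0, giving
 * value = 0x7.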
131 | */ 132 | cr4 host_cr4; 133 | host_cr4.flags = __readcr4(); 134 | host_cr4.os_xsave = true; 135 | __writecr4(host_cr4.flags); 136 | 137 | uint64_t value = (vcpu->guest_context.rdx << 32) | (uint32_t)vcpu->guest_context.rax; 138 | 139 | DEBUG_PRINT("XSETBV field 0x%lX value 0x%lX", field, value); 140 | __xsetbv(field, value); 141 | 142 | *move_to_next = true; 143 | } 144 | 145 | static void handle_invd(struct vcpu_ctx *vcpu, void *opaque, bool *move_to_next) 146 | { 147 | (void)vcpu; 148 | (void)opaque; 149 | 150 | DEBUG_PRINT("INVD"); 151 | __invd(); 152 | *move_to_next = true; 153 | } 154 | 155 | static void handle_init_signal(struct vcpu_ctx *vcpu, void *opaque, bool *move_to_next) 156 | { 157 | (void)opaque; 158 | 159 | /* Control registers. */ 160 | cr0 guest_cr0 = { 161 | .extension_type = true, 162 | .numeric_error = true, 163 | .not_write_through = true, 164 | .cache_disable = true 165 | }; 166 | 167 | __vmwrite(VMCS_GUEST_CR0, guest_cr0.flags); 168 | 169 | __vmwrite(VMCS_GUEST_CR3, 0); 170 | 171 | cr4 guest_cr4 = { 172 | .vmx_enable = true 173 | }; 174 | __vmwrite(VMCS_GUEST_CR4, guest_cr4.flags); 175 | 176 | /* Configure the GDT entries & access rights. */ 177 | vmx_segment_access_rights guest_ar = { 0 }; 178 | guest_ar.type = SEGMENT_DESCRIPTOR_TYPE_CODE_ERA; 179 | guest_ar.descriptor_type = true; 180 | guest_ar.present = true; 181 | 182 | __vmwrite(VMCS_GUEST_CS_SEL, 0xf000); 183 | __vmwrite(VMCS_GUEST_CS_BASE, 0xffff0000); 184 | __vmwrite(VMCS_GUEST_CS_LIMIT, 0xffff); 185 | __vmwrite(VMCS_GUEST_CS_ACCESS_RIGHTS, guest_ar.flags); 186 | 187 | guest_ar.type = SEGMENT_DESCRIPTOR_TYPE_DATA_RWA; 188 | __vmwrite(VMCS_GUEST_SS_SEL, 0); 189 | __vmwrite(VMCS_GUEST_SS_BASE, 0); 190 | __vmwrite(VMCS_GUEST_SS_LIMIT, 0xffff); 191 | __vmwrite(VMCS_GUEST_SS_ACCESS_RIGHTS, guest_ar.flags); 192 | __vmwrite(VMCS_GUEST_DS_SEL, 0); 193 | __vmwrite(VMCS_GUEST_DS_BASE, 0); 194 | __vmwrite(VMCS_GUEST_DS_LIMIT, 0xffff); 195 | __vmwrite(VMCS_GUEST_DS_ACCESS_RIGHTS, guest_ar.flags); 196 | __vmwrite(VMCS_GUEST_ES_SEL, 0); 197 | __vmwrite(VMCS_GUEST_ES_BASE, 0); 198 | __vmwrite(VMCS_GUEST_ES_LIMIT, 0xffff); 199 | __vmwrite(VMCS_GUEST_ES_ACCESS_RIGHTS, guest_ar.flags); 200 | __vmwrite(VMCS_GUEST_FS_SEL, 0); 201 | __vmwrite(VMCS_GUEST_FS_BASE, 0); 202 | __vmwrite(VMCS_GUEST_FS_LIMIT, 0xffff); 203 | __vmwrite(VMCS_GUEST_FS_ACCESS_RIGHTS, guest_ar.flags); 204 | __vmwrite(VMCS_GUEST_GS_SEL, 0); 205 | __vmwrite(VMCS_GUEST_GS_BASE, 0); 206 | __vmwrite(VMCS_GUEST_GS_LIMIT, 0xffff); 207 | __vmwrite(VMCS_GUEST_GS_ACCESS_RIGHTS, guest_ar.flags); 208 | 209 | __vmwrite(VMCS_GUEST_GDTR_BASE, 0); 210 | __vmwrite(VMCS_GUEST_GDTR_LIMIT, 0xffff); 211 | __vmwrite(VMCS_GUEST_IDTR_BASE, 0); 212 | __vmwrite(VMCS_GUEST_IDTR_LIMIT, 0xffff); 213 | 214 | guest_ar.type = SEGMENT_DESCRIPTOR_TYPE_LDT; 215 | guest_ar.descriptor_type = false; 216 | __vmwrite(VMCS_GUEST_LDTR_SEL, 0); 217 | __vmwrite(VMCS_GUEST_LDTR_BASE, 0); 218 | __vmwrite(VMCS_GUEST_LDTR_LIMIT, 0xffff); 219 | __vmwrite(VMCS_GUEST_LDTR_ACCESS_RIGHTS, guest_ar.flags); 220 | 221 | guest_ar.type = SEGMENT_DESCRIPTOR_TYPE_TSS_BUSY; 222 | __vmwrite(VMCS_GUEST_TR_SEL, 0); 223 | __vmwrite(VMCS_GUEST_TR_BASE, 0); 224 | __vmwrite(VMCS_GUEST_TR_LIMIT, 0xffff); 225 | __vmwrite(VMCS_GUEST_TR_ACCESS_RIGHTS, guest_ar.flags); 226 | 227 | /* Configure some extra functionality. */ 228 | __vmwrite(VMCS_GUEST_EFER, 0); 229 | __vmwrite(VMCS_GUEST_FS_BASE, 0); 230 | __vmwrite(VMCS_GUEST_GS_BASE, 0); 231 | 232 | /* Configure the debug registers. 
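 * Only DR7 is actually written here (0x400 being its architectural reset
 * value, and DR7 having a VMCS field); the raw __writedr calls below are left
 * commented out, presumably because in root mode they would modify the host's
 * debug registers rather than the guest's.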
*/ 233 | // __writedr(0, 0); 234 | // __writedr(1, 0); 235 | // __writedr(2, 0); 236 | // __writedr(3, 0); 237 | // __writedr(6, 0xffff0ff0); 238 | __vmwrite(VMCS_GUEST_DR7, 0x400); 239 | 240 | /* Configure the guest context. */ 241 | struct vcpu_context *guest_context = &vcpu->guest_context; 242 | 243 | cpuid_eax_01 version_info; 244 | CPUID_LEAF_READ(CPUID_VERSION_INFO, version_info); 245 | guest_context->rdx = 0x600 | ((uint64_t)version_info.eax.extended_model_id << 16); 246 | guest_context->rbx = 0; 247 | guest_context->rcx = 0; 248 | guest_context->rsi = 0; 249 | guest_context->rdi = 0; 250 | guest_context->rbp = 0; 251 | guest_context->r8 = 0; 252 | guest_context->r9 = 0; 253 | guest_context->r10 = 0; 254 | guest_context->r11 = 0; 255 | guest_context->r12 = 0; 256 | guest_context->r13 = 0; 257 | guest_context->r14 = 0; 258 | guest_context->r15 = 0; 259 | 260 | /* Configure instruction pointer, stack etc. */ 261 | rfl guest_rflags = { 262 | .read_as_1 = true 263 | }; 264 | 265 | __vmwrite(VMCS_GUEST_RFLAGS, guest_rflags.flags); 266 | __vmwrite(VMCS_GUEST_RIP, 0xfff0); 267 | __vmwrite(VMCS_GUEST_RSP, 0); 268 | 269 | /* Configure the entry controls. */ 270 | ia32_vmx_entry_ctls_register entry_ctls = { 271 | .flags = __vmread(VMCS_CTRL_ENTRY) 272 | }; 273 | entry_ctls.ia32e_mode_guest = false; 274 | __vmwrite(VMCS_CTRL_ENTRY, entry_ctls.flags); 275 | 276 | /* Indicate we are waiting for SIPI. */ 277 | __vmwrite(VMCS_GUEST_ACTIVITY_STATE, vmx_wait_for_sipi); 278 | *move_to_next = false; 279 | } 280 | 281 | static void handle_sipi(struct vcpu_ctx *vcpu, void *opaque, bool *move_to_next) 282 | { 283 | (void)vcpu; 284 | (void)opaque; 285 | 286 | uint64_t vector = __vmread(VMCS_EXIT_QUALIFICATION); 287 | 288 | __vmwrite(VMCS_GUEST_CS_SEL, vector << 8); 289 | __vmwrite(VMCS_GUEST_CS_BASE, vector << 12); 290 | __vmwrite(VMCS_GUEST_RIP, 0); 291 | 292 | __vmwrite(VMCS_GUEST_ACTIVITY_STATE, vmx_active); 293 | *move_to_next = false; 294 | } 295 | 296 | static void dump_guest_state(void) 297 | { 298 | // 299 | // 16-Bit Guest-State Fields 300 | // 301 | debug_print("Guest ES Selector = %016llx", __vmread(VMCS_GUEST_ES_SEL)); 302 | debug_print("Guest CS Selector = %016llx", __vmread(VMCS_GUEST_CS_SEL)); 303 | debug_print("Guest SS Selector = %016llx", __vmread(VMCS_GUEST_SS_SEL)); 304 | debug_print("Guest DS Selector = %016llx", __vmread(VMCS_GUEST_DS_SEL)); 305 | debug_print("Guest FS Selector = %016llx", __vmread(VMCS_GUEST_FS_SEL)); 306 | debug_print("Guest GS Selector = %016llx", __vmread(VMCS_GUEST_GS_SEL)); 307 | debug_print("Guest LDTR Selector = %016llx", __vmread(VMCS_GUEST_LDTR_SEL)); 308 | debug_print("Guest TR Selector = %016llx", __vmread(VMCS_GUEST_TR_SEL)); 309 | debug_print("Guest interrupt status = %016llx", __vmread(VMCS_GUEST_INTR_STATUS)); 310 | debug_print("PML index = %016llx", __vmread(VMCS_GUEST_PML_INDEX)); 311 | 312 | // 313 | // 64-Bit Guest-State Fields 314 | // 315 | debug_print("VMCS link pointer = %016llx", __vmread(VMCS_GUEST_VMCS_LINK_PTR)); 316 | debug_print("Guest IA32_DEBUGCTL = %016llx", __vmread(VMCS_GUEST_DEBUGCTL)); 317 | debug_print("Guest IA32_PAT = %016llx", __vmread(VMCS_GUEST_PAT)); 318 | debug_print("Guest IA32_EFER = %016llx", __vmread(VMCS_GUEST_EFER)); 319 | debug_print("Guest IA32_PERF_GLOBAL_CTRL = %016llx", __vmread(VMCS_GUEST_PERF_GLOBAL_CTRL)); 320 | debug_print("Guest PDPTE0 = %016llx", __vmread(VMCS_GUEST_PDPTE0)); 321 | debug_print("Guest PDPTE1 = %016llx", __vmread(VMCS_GUEST_PDPTE1)); 322 | debug_print("Guest PDPTE2 = %016llx", 
__vmread(VMCS_GUEST_PDPTE2)); 323 | debug_print("Guest PDPTE3 = %016llx", __vmread(VMCS_GUEST_PDPTE3)); 324 | debug_print("Guest IA32_BNDCFGS = %016llx", __vmread(VMCS_GUEST_BNDCFGS)); 325 | debug_print("Guest IA32_RTIT_CTL = %016llx", __vmread(VMCS_GUEST_RTIT_CTL)); 326 | 327 | // 328 | // 32-Bit Guest-State Fields 329 | // 330 | debug_print("Guest ES Limit = %016llx", __vmread(VMCS_GUEST_ES_LIMIT)); 331 | debug_print("Guest CS Limit = %016llx", __vmread(VMCS_GUEST_CS_LIMIT)); 332 | debug_print("Guest SS Limit = %016llx", __vmread(VMCS_GUEST_SS_LIMIT)); 333 | debug_print("Guest DS Limit = %016llx", __vmread(VMCS_GUEST_DS_LIMIT)); 334 | debug_print("Guest FS Limit = %016llx", __vmread(VMCS_GUEST_FS_LIMIT)); 335 | debug_print("Guest GS Limit = %016llx", __vmread(VMCS_GUEST_GS_LIMIT)); 336 | debug_print("Guest LDTR Limit = %016llx", __vmread(VMCS_GUEST_LDTR_LIMIT)); 337 | debug_print("Guest TR Limit = %016llx", __vmread(VMCS_GUEST_TR_LIMIT)); 338 | debug_print("Guest GDTR limit = %016llx", __vmread(VMCS_GUEST_GDTR_LIMIT)); 339 | debug_print("Guest IDTR limit = %016llx", __vmread(VMCS_GUEST_IDTR_LIMIT)); 340 | debug_print("Guest ES access rights = %016llx", __vmread(VMCS_GUEST_ES_ACCESS_RIGHTS)); 341 | debug_print("Guest CS access rights = %016llx", __vmread(VMCS_GUEST_CS_ACCESS_RIGHTS)); 342 | debug_print("Guest SS access rights = %016llx", __vmread(VMCS_GUEST_SS_ACCESS_RIGHTS)); 343 | debug_print("Guest DS access rights = %016llx", __vmread(VMCS_GUEST_DS_ACCESS_RIGHTS)); 344 | debug_print("Guest FS access rights = %016llx", __vmread(VMCS_GUEST_FS_ACCESS_RIGHTS)); 345 | debug_print("Guest GS access rights = %016llx", __vmread(VMCS_GUEST_GS_ACCESS_RIGHTS)); 346 | debug_print("Guest LDTR access rights = %016llx", __vmread(VMCS_GUEST_LDTR_ACCESS_RIGHTS)); 347 | debug_print("Guest TR access rights = %016llx", __vmread(VMCS_GUEST_TR_ACCESS_RIGHTS)); 348 | debug_print("Guest interruptibility state = %016llx", __vmread(VMCS_GUEST_INTERRUPTIBILITY_STATE)); 349 | debug_print("Guest activity state = %016llx", __vmread(VMCS_GUEST_ACTIVITY_STATE)); 350 | debug_print("Guest SMBASE = %016llx", __vmread(VMCS_GUEST_SMBASE)); 351 | debug_print("Guest IA32_SYSENTER_CS = %016llx", __vmread(VMCS_GUEST_SYSENTER_CS)); 352 | debug_print("VMX-preemption timer value = %016llx", __vmread(VMCS_GUEST_PREEMPT_TIMER_VALUE)); 353 | 354 | // 355 | // Natural-Width Guest-State Fields 356 | // 357 | debug_print("Guest CR0 = %016llx", __vmread(VMCS_GUEST_CR0)); 358 | debug_print("Guest CR3 = %016llx", __vmread(VMCS_GUEST_CR3)); 359 | debug_print("Guest CR4 = %016llx", __vmread(VMCS_GUEST_CR4)); 360 | debug_print("Guest ES Base = %016llx", __vmread(VMCS_GUEST_ES_BASE)); 361 | debug_print("Guest CS Base = %016llx", __vmread(VMCS_GUEST_CS_BASE)); 362 | debug_print("Guest SS Base = %016llx", __vmread(VMCS_GUEST_SS_BASE)); 363 | debug_print("Guest DS Base = %016llx", __vmread(VMCS_GUEST_DS_BASE)); 364 | debug_print("Guest FS Base = %016llx", __vmread(VMCS_GUEST_FS_BASE)); 365 | debug_print("Guest GS Base = %016llx", __vmread(VMCS_GUEST_GS_BASE)); 366 | debug_print("Guest LDTR base = %016llx", __vmread(VMCS_GUEST_LDTR_BASE)); 367 | debug_print("Guest TR base = %016llx", __vmread(VMCS_GUEST_TR_BASE)); 368 | debug_print("Guest GDTR base = %016llx", __vmread(VMCS_GUEST_GDTR_BASE)); 369 | debug_print("Guest IDTR base = %016llx", __vmread(VMCS_GUEST_IDTR_BASE)); 370 | debug_print("Guest DR7 = %016llx", __vmread(VMCS_GUEST_DR7)); 371 | debug_print("Guest RSP = %016llx", __vmread(VMCS_GUEST_RSP)); 372 | debug_print("Guest RIP = %016llx", 
__vmread(VMCS_GUEST_RIP)); 373 | debug_print("Guest RFLAGS = %016llx", __vmread(VMCS_GUEST_RFLAGS)); 374 | debug_print("Guest pending debug exceptions = %016llx", __vmread(VMCS_GUEST_PENDING_DEBUG_EXCEPTIONS)); 375 | debug_print("Guest IA32_SYSENTER_ESP = %016llx", __vmread(VMCS_GUEST_SYSENTER_ESP)); 376 | debug_print("Guest IA32_SYSENTER_EIP = %016llx", __vmread(VMCS_GUEST_SYSENTER_EIP)); 377 | } 378 | 379 | static void dump_guest_page(void) 380 | { 381 | static const int PRINT_PER_LINE = 32; 382 | uintptr_t guest_rip = __vmread(VMCS_GUEST_RIP); 383 | uint8_t *page_start = (uint8_t *)(guest_rip & ~0xFFFull); 384 | uint8_t *page_end = (uint8_t *)(page_start + PAGE_SIZE); 385 | 386 | debug_print("Dumping guest page of RIP=0x%lX 0x%lX-0x%lX", guest_rip, page_start, page_end); 387 | 388 | for (uint8_t* current = page_start; current < page_end; current += PRINT_PER_LINE) { 389 | print_buffer("0x%016lX ", current); 390 | for (int i = 0; i < PRINT_PER_LINE; i++) { 391 | print_buffer("%02X ", current[i]); 392 | } 393 | print_buffer("\r\n"); 394 | } 395 | print_buffer("\r\n"); 396 | } 397 | 398 | static void dump_state(struct vcpu_ctx *vcpu) 399 | { 400 | (void)vcpu; 401 | dump_guest_state(); 402 | dump_guest_page(); 403 | } 404 | 405 | static void handle_exit_reason(struct vcpu_ctx *vcpu) 406 | { 407 | /* Retrieve the handler context from the vCPU. */ 408 | struct handler_ctx *ctx = vcpu->vmm->handler; 409 | 410 | /* Determine the exit reason and then call the appropriate exit handler. */ 411 | size_t reason = __vmread(VMCS_EXIT_REASON) & 0xFFFF; 412 | bool move_to_next_instr = false; 413 | 414 | /* Check to see if the exit reason is out of range. */ 415 | die_on(reason >= MAX_EXIT_HANDLERS, 416 | "Exit reason 0x%lX rip 0x%lX not within range of handler table", 417 | reason, vcpu->guest_context.rip); 418 | 419 | struct vmexit_handler *exit_head = ctx->handlers[reason]; 420 | 421 | 422 | if (!exit_head) { 423 | dump_state(vcpu); 424 | /* Check to see if we actually have a handler for it. */ 425 | uint8_t *rip_bytes = (uint8_t *)vcpu->guest_context.rip; 426 | die_on(!exit_head, "vcpu=%d no exit reason handlers for 0x%lX at rip 0x%lX " 427 | "rip[0]=%02X rip[1]=%02X rip[2]=%02X rip[3]=%02X rip[4]=%02X rip[5]=%02X", 428 | vcpu->idx, reason, vcpu->guest_context.rip, 429 | rip_bytes[0], rip_bytes[1], rip_bytes[2], rip_bytes[3], rip_bytes[4], rip_bytes[5]); 430 | } 431 | 432 | /* Iterate from tail to head calling each, stop at override. */ 433 | struct vmexit_handler *curr_handler = exit_head->prev; 434 | while (true) { 435 | /* Call the callback for the VMEXIT. If this handler has the override 436 | * set, this means we SHOULDN'T call any others, so break. */ 437 | curr_handler->callback(vcpu, curr_handler->opaque, &move_to_next_instr); 438 | 439 | if (curr_handler->override) 440 | break; 441 | 442 | if (curr_handler == exit_head) 443 | break; 444 | 445 | curr_handler = curr_handler->prev; 446 | }; 447 | 448 | /* 449 | * If the exit handler indicated to increment RIP, do so. 450 | * We cannot use the guest_context.rip field to increment as 451 | * this will not be restored on re-enter to the guest, we need to 452 | * directly write to the VMCS field instead. 
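 * The amount to skip comes from VMCS_EXIT_INSTR_LENGTH; e.g. for a CPUID
 * exit (opcode 0F A2) the guest RIP is advanced by 2 bytes.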
453 | */ 454 | if (move_to_next_instr) { 455 | size_t guest_rip = __vmread(VMCS_GUEST_RIP); 456 | guest_rip += __vmread(VMCS_EXIT_INSTR_LENGTH); 457 | __vmwrite(VMCS_GUEST_RIP, guest_rip); 458 | } 459 | 460 | handle_cached_interrupts(vcpu); 461 | } 462 | 463 | static void register_generic_handlers(struct handler_ctx *ctx) 464 | { 465 | static const vmexit_cbk_t GENERIC_HANDLERS[MAX_EXIT_HANDLERS] = { 466 | [VMX_EXIT_REASON_CPUID] = handle_cpuid, 467 | [VMX_EXIT_REASON_XSETBV] = handle_xsetbv, 468 | [VMX_EXIT_REASON_INVD] = handle_invd, 469 | [VMX_EXIT_REASON_INIT_SIGNAL] = handle_init_signal, 470 | [VMX_EXIT_REASON_SIPI] = handle_sipi, 471 | }; 472 | 473 | /* Register all of our generic handlers 474 | * This is done by iterating a key/value array of exit to internal handlers. */ 475 | for (int exit_reason = 0; exit_reason < MAX_EXIT_HANDLERS; exit_reason++) { 476 | vmexit_cbk_t cbk = GENERIC_HANDLERS[exit_reason]; 477 | 478 | if (cbk) { 479 | DEBUG_PRINT("Registering generic exit 0x%lX callback 0x%lX", exit_reason, cbk); 480 | handler_register_exit(ctx, exit_reason, cbk, NULL, false); 481 | } 482 | } 483 | } 484 | 485 | struct handler_ctx *handler_init(struct vmm_ctx *vmm) 486 | { 487 | struct handler_ctx *ctx = vmem_alloc(sizeof(struct handler_ctx), MEM_WRITE); 488 | die_on(!ctx, "Unable to allocate context for VMEXIT handlers."); 489 | vmm->handler = ctx; 490 | ctx->vmm = vmm; 491 | 492 | register_generic_handlers(ctx); 493 | return ctx; 494 | } 495 | 496 | void handler_register_exit(struct handler_ctx *ctx, 497 | size_t exit_reason, 498 | vmexit_cbk_t callback, 499 | void *opaque, 500 | bool override) 501 | { 502 | /* Ensure synchronization. */ 503 | spin_lock(&ctx->vmm->lock); 504 | 505 | die_on(exit_reason >= MAX_EXIT_HANDLERS, "Invalid exit handler index 0x%lX", exit_reason); 506 | 507 | /* Allocate a new vmexit handler. */ 508 | struct vmexit_handler *new_handler = vmem_alloc(sizeof(struct vmexit_handler), MEM_WRITE); 509 | die_on(!new_handler, "Unable to allocate memory for VMEXIT handler."); 510 | 511 | /* Fill out the information for the handler. */ 512 | new_handler->callback = callback; 513 | new_handler->opaque = opaque; 514 | new_handler->override = override; 515 | 516 | /* Now manipulate the linked list for vmexit entry. */ 517 | struct vmexit_handler **exit_base = &ctx->handlers[exit_reason]; 518 | struct vmexit_handler *exit_head = *exit_base; 519 | new_handler->next = NULL; 520 | if (exit_head == NULL) { 521 | /* No current exit handlers. */ 522 | new_handler->prev = new_handler; 523 | *exit_base = new_handler; 524 | } else { 525 | /* An event already exists. 526 | * Add our new handler to the tail of the list. */ 527 | struct vmexit_handler *exit_tail = exit_head->prev; 528 | die_on(override && exit_tail->override, 529 | "Cannot override if an override for exit 0x%X already set", 530 | exit_reason); 531 | 532 | new_handler->prev = exit_tail; 533 | exit_tail->next = new_handler; 534 | } 535 | 536 | DEBUG_PRINT("VMEXIT registered for 0x%lX cbk 0x%lX opaque 0x%lX override %d", 537 | exit_reason, callback, opaque, override); 538 | spin_unlock(&ctx->vmm->lock); 539 | } 540 | 541 | __attribute__((ms_abi)) void handler_guest_to_host(struct vcpu_context *guest_ctx) 542 | { 543 | /* 544 | * Because we had to use RCX in shim_guest_to_host as a parameter 545 | * for the __capture_context which gets passed to this handler 546 | * we have to retrieve this value and store it back in the guest_context. 
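 * The shim is expected to have saved the original RCX in the 8 bytes just
 * below the captured context on the host stack, which is why it is read back
 * from (uintptr_t)guest_ctx - 8, and why restore_rsp below accounts for one
 * extra 8-byte slot on the captured stack.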
*/ 547 | uint64_t restore_rsp = guest_ctx->rsp + sizeof(uint64_t); 548 | guest_ctx->rcx = *(uint64_t *)((uintptr_t)guest_ctx - sizeof(guest_ctx->rcx)); 549 | 550 | /* Set what the guest RIP and RSP were. */ 551 | guest_ctx->rsp = __vmread(VMCS_GUEST_RSP); 552 | guest_ctx->rip = __vmread(VMCS_GUEST_RIP); 553 | 554 | /* 555 | * Find the vcpu_ctx structure by backtracing from the guest_ctx which we can 556 | * assume was stored on the host_stack. 557 | */ 558 | struct vcpu_ctx *vcpu = vmm_get_vcpu_ctx(); 559 | 560 | /* Indicate running as host and then copy the guest context from stack to vcpu struct. */ 561 | vcpu->guest_context = *guest_ctx; 562 | 563 | /* Handle the VMEXIT reason. */ 564 | handle_exit_reason(vcpu); 565 | 566 | /* 567 | * Trigger the return back into guest mode, by adjusting RIP in our stored 568 | * guest context and then adjust the context RIP to our VMRESUME handler. 569 | */ 570 | vcpu->guest_context.rsp = restore_rsp; 571 | vcpu->guest_context.rip = (uint64_t)__vmresume; 572 | __restore_context(&vcpu->guest_context); 573 | } -------------------------------------------------------------------------------- /hypervisor/vmm/vmm.c: -------------------------------------------------------------------------------- 1 | #define DEBUG_MODULE 2 | #include "platform/standard.h" 3 | #include "platform/intrin.h" 4 | #include "platform/util.h" 5 | #include "memory/pmem.h" 6 | #include "memory/mem.h" 7 | #include "vmm.h" 8 | #include "ept.h" 9 | #include "handler.h" 10 | #include "vmcall.h" 11 | #include "nested.h" 12 | #include "shim.h" 13 | #include "impl_hooks.h" 14 | #include "ia32_compact.h" 15 | 16 | /* The main VMM that will initialise the hypervisor, 17 | * currently this is aimed at only targetting x86_64 platforms. */ 18 | 19 | /* Mask used to ignore the ring level when specifying a selector. */ 20 | #define IGNORE_RPL_MASK (~3) 21 | 22 | /* Holds information on a GDT entry. */ 23 | struct gdt_entry { 24 | size_t base; 25 | uint32_t limit; 26 | uint16_t sel; 27 | vmx_segment_access_rights access; 28 | }; 29 | 30 | /* Global const offset used for calculating where the stack pointer 31 | * will be offsetted from the vcpu_ctx upon a guest_to_host or vice versa 32 | * transition. We have to use this as our shim assembly code uses it for 33 | * reclaiming the vcpu_ctx pointer upon hyperjacking. 
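 * In other words, vcpu + VMM_HYPERJACK_STACK_OFFSET points exactly
 * sizeof(struct vcpu_context) bytes below the top of host_stack, leaving just
 * enough room above the stack pointer for one captured context and letting
 * the shim recover the vcpu_ctx base by subtracting the same constant.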
*/ 34 | const size_t VMM_HYPERJACK_STACK_OFFSET = offsetof(struct vcpu_ctx, host_stack) + 35 | HOST_STACK_SIZE - sizeof(struct vcpu_context); 36 | 37 | static void probe_capabilities() 38 | { 39 | DEBUG_PRINT("Checking CPU capabilities."); 40 | 41 | cpuid_eax_01 version_info; 42 | int rc = CPUID_LEAF_READ(CPUID_VERSION_INFO, version_info); 43 | die_on(!rc, "Unable to query version information."); 44 | die_on(!version_info.ecx.virtual_machine_extensions, "No virtual machine extensions."); 45 | 46 | cpuid_eax_80000001 extend_cpu; 47 | rc = CPUID_LEAF_READ(CPUID_EXTENDED_CPU_SIGNATURE, extend_cpu); 48 | die_on(!rc, "Unable to read extended CPUID signature."); 49 | die_on(!extend_cpu.edx.pages_1gb_available, "No 1GB pages support."); 50 | 51 | ia32_feature_control_register feature_control; 52 | feature_control.flags = rdmsr(IA32_FEATURE_CONTROL); 53 | die_on(!feature_control.lock_bit, "Lock bit not set."); 54 | die_on(!feature_control.enable_vmx_outside_smx, "VMX not enabled outside SMX."); 55 | 56 | ia32_vmx_ept_vpid_cap_register ept_vpid; 57 | ept_vpid.flags = rdmsr(IA32_VMX_EPT_VPID_CAP); 58 | die_on(!ept_vpid.page_walk_length_4, "EPT PML4 not supported."); 59 | die_on(!ept_vpid.memory_type_write_back, "EPT memory type WB not supported."); 60 | die_on(!ept_vpid.pde_2mb_pages, "EPT 2MB pages not supported."); 61 | 62 | DEBUG_PRINT("CPU seems to provide all capabilities needed."); 63 | } 64 | 65 | static void print_gdt(char *prefix, segment_descriptor_register_64 *gdtr) 66 | { 67 | (void)prefix; 68 | (void)gdtr; 69 | 70 | #ifdef DEBUG_MODULE 71 | DEBUG_PRINT("--- %s GDT base 0x%lX limit 0x%lX", prefix, gdtr->base_address, gdtr->limit); 72 | 73 | segment_descriptor_32 *gdt = (segment_descriptor_32 *)gdtr->base_address; 74 | int desc_max = (gdtr->limit + 1ull) / sizeof(segment_descriptor_32); 75 | for (int i = 0; i < desc_max; i++) { 76 | segment_descriptor_32 *curr_desc = (segment_descriptor_32 *)&gdt[i]; 77 | 78 | uint32_t seg_lim = (curr_desc->segment_limit_high << 16) | curr_desc->segment_limit_low; 79 | uintptr_t base_addr = (curr_desc->base_address_high << 24) | 80 | (curr_desc->base_address_middle << 16) | 81 | (curr_desc->base_address_low & UINT16_MAX); 82 | 83 | DEBUG_PRINT("Descriptor 0x%lX\r\n" \ 84 | "------ Flags 0x%X\r\n" \ 85 | "------ Present 0x%X\r\n" \ 86 | "------ Type 0x%X\r\n" \ 87 | "------ Segment limit 0x%X\r\n" \ 88 | "------ Base address 0x%lX\r\n", 89 | (uintptr_t)curr_desc, 90 | curr_desc->flags, 91 | curr_desc->present, 92 | curr_desc->type, 93 | seg_lim, 94 | base_addr); 95 | } 96 | #endif /* DEBUG_VMM */ 97 | } 98 | 99 | static void configure_vcpu_gdt(struct gdt_config *gdt_cfg) 100 | { 101 | /* Everything within the GDT is in linear/physical addresses 102 | * rather than virtual, therefore we need to retrieve CR3 so 103 | * that we can do some conversions from virt to phys. */ 104 | cr3 this_cr3; 105 | this_cr3.flags = __readcr3(); 106 | 107 | /* Read the original GDTR and store it so we can use it for the guest later. */ 108 | __sgdt(&gdt_cfg->guest_gdtr); 109 | __sldt(&gdt_cfg->guest_ldtr); 110 | die_on(!gdt_cfg->guest_gdtr.base_address, "No base address set for guest GDTR"); 111 | die_on(!gdt_cfg->guest_gdtr.limit, "No limit set for guest GDTR"); 112 | 113 | /* For the host GDT we're going to copy the guest GDT and then append 114 | * a TSS to the GDT as this is required for VMX to be used, unfortunately 115 | * the UEFI environment doesn't set this up. 
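 * Concretely: the host GDTR limit below grows by sizeof(segment_descriptor_64)
 * (16 bytes) to fit the appended TSS descriptor, and the new TR index is
 * (guest limit + 1) / 8, e.g. a guest limit of 0x47 (nine descriptors) gives a
 * host limit of 0x57 and TR index 9. (Note the memcpy length of
 * guest_gdtr.limit is one byte short of the full table, as a GDTR limit is
 * size - 1.)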
*/ 116 | memcpy(gdt_cfg->host_gdt, 117 | (const void *)gdt_cfg->guest_gdtr.base_address, 118 | gdt_cfg->guest_gdtr.limit); 119 | 120 | /* Configure the GDTR we're going to use for the host. */ 121 | uintptr_t host_gdt_phys = mem_va_to_pa(this_cr3, gdt_cfg->host_gdt); 122 | gdt_cfg->host_gdtr.base_address = host_gdt_phys; 123 | gdt_cfg->host_gdtr.limit = gdt_cfg->guest_gdtr.limit + sizeof(segment_descriptor_64); 124 | DEBUG_PRINT("Host GDTR base %lX limit %lX", 125 | gdt_cfg->host_gdtr.base_address, 126 | gdt_cfg->host_gdtr.limit); 127 | 128 | /* Append the TR to the end of the GDT. */ 129 | gdt_cfg->host_tr.flags = 0; 130 | gdt_cfg->host_tr.index = (gdt_cfg->guest_gdtr.limit + 1ull) / sizeof(segment_descriptor_32); 131 | DEBUG_PRINT("Host TR index %d", gdt_cfg->host_tr.index); 132 | 133 | uintptr_t tss_pa = mem_va_to_pa(this_cr3, &gdt_cfg->host_tss); 134 | segment_descriptor_64 tss_desc = { 0 }; 135 | tss_desc.segment_limit_low = sizeof(struct task_state_segment_64) - 1; 136 | tss_desc.base_address_low = tss_pa & UINT16_MAX; 137 | tss_desc.base_address_middle = (tss_pa >> 16) & UINT8_MAX; 138 | tss_desc.base_address_high = (tss_pa >> 24) & UINT8_MAX; 139 | tss_desc.base_address_upper = (tss_pa >> 32) & UINT32_MAX; 140 | tss_desc.type = SEGMENT_DESCRIPTOR_TYPE_TSS_AVAILABLE; 141 | tss_desc.present = true; 142 | 143 | /* Now write the newly created TSS to our host GDT. */ 144 | segment_descriptor_32 *gdt32 = (segment_descriptor_32 *)gdt_cfg->host_gdt; 145 | segment_descriptor_64 *tss_in_gdt = (segment_descriptor_64 *)&gdt32[gdt_cfg->host_tr.index]; 146 | *tss_in_gdt = tss_desc; 147 | 148 | print_gdt("Host", &gdt_cfg->host_gdtr); 149 | 150 | /* Write the new GDTR and TR. */ 151 | uintptr_t phys_gdtr = mem_va_to_pa(this_cr3, &gdt_cfg->host_gdtr); 152 | __lgdt((void*)phys_gdtr); 153 | __ltr(&gdt_cfg->host_tr); 154 | } 155 | 156 | static void capture_control_regs(struct control_registers *regs) 157 | { 158 | regs->reg_cr0.flags = __readcr0(); 159 | regs->reg_cr3.flags = __readcr3(); 160 | regs->reg_cr4.flags = __readcr4(); 161 | regs->debugctl.flags = rdmsr(IA32_DEBUGCTL); 162 | regs->gs_base = rdmsr(IA32_GS_BASE); 163 | regs->dr7 = __readdr7(); 164 | 165 | DEBUG_PRINT("--- cr0 %lX\r\n" \ 166 | "--- cr3 %lX\r\n" \ 167 | "--- cr4 %lX\r\n" \ 168 | "--- debugctl %lX\r\n" \ 169 | "--- gs_base %lX\r\n" \ 170 | "--- dr7 %lX", 171 | regs->reg_cr0.flags, regs->reg_cr3.flags, regs->reg_cr4.flags, 172 | regs->debugctl.flags, regs->gs_base, regs->dr7); 173 | } 174 | 175 | static void enter_root_mode(struct vcpu_ctx *vcpu) 176 | { 177 | /* Set up root VMXON and the guest VMCS. */ 178 | ia32_vmx_basic_register basic; 179 | basic.flags = rdmsr(IA32_VMX_BASIC); 180 | 181 | memset(&vcpu->host_vmxon, 0, sizeof(vcpu->host_vmxon)); 182 | memset(&vcpu->guest_vmcs, 0, sizeof(vcpu->guest_vmcs)); 183 | vcpu->host_vmxon.revision_id = basic.vmcs_revision_id; 184 | vcpu->guest_vmcs.revision_id = basic.vmcs_revision_id; 185 | 186 | /* Set the fixed requirements for the control registers for VMX. */ 187 | vcpu->guest_ctrl_regs.reg_cr0 = vmm_adjust_cr0(vcpu->guest_ctrl_regs.reg_cr0); 188 | vcpu->guest_ctrl_regs.reg_cr4 = vmm_adjust_cr4(vcpu->guest_ctrl_regs.reg_cr4); 189 | 190 | /* Update host CR0/4 with new updated fields. */ 191 | __writecr0(vcpu->guest_ctrl_regs.reg_cr0.flags); 192 | __writecr4(vcpu->guest_ctrl_regs.reg_cr4.flags); 193 | 194 | /* Calculate the physical addresses of vmxon and vmcs. 
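 * VMXON, VMCLEAR and VMPTRLD all take a 64-bit physical address as an
 * in-memory operand, which is why the wrappers below are handed
 * &phys_vmxon / &phys_vmcs rather than the values themselves.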
*/ 195 | cr3 this_cr3; 196 | this_cr3.flags = __readcr3(); 197 | void *phys_vmxon = (void *)mem_va_to_pa(this_cr3, &vcpu->host_vmxon); 198 | void *phys_vmcs = (void *)mem_va_to_pa(this_cr3, &vcpu->guest_vmcs); 199 | 200 | die_on(!__vmxon(&phys_vmxon), "Unable to enter VMX root mode."); 201 | die_on(!__vmclear(&phys_vmcs), "Unable to clear VMCS."); 202 | die_on(!__vmptrld(&phys_vmcs), "Unable to load the VMCS."); 203 | } 204 | 205 | static uint64_t encode_msr(uint64_t ctrl, uint64_t desired) 206 | { 207 | /* 208 | * VMX feature/capability MSRs encode the "must be 0" bits in the high word 209 | * of their value, and the "must be 1" bits in the low word of their value. 210 | * Adjust any requested capability/feature based on these requirements. 211 | */ 212 | desired &= (uint32_t)(ctrl >> 32); 213 | desired |= (uint32_t)ctrl; 214 | return desired; 215 | } 216 | 217 | static void gather_gdt_entry(segment_descriptor_register_64 *gdtr, uint16_t sel, 218 | struct gdt_entry *entry) 219 | { 220 | /* If the selector is not valid (0) set unusable entry. */ 221 | if (sel == 0) { 222 | entry->sel = 0; 223 | entry->limit = 0; 224 | entry->base = 0; 225 | entry->access.flags = 0; 226 | entry->access.unusable = true; 227 | return; 228 | } 229 | 230 | /* Calculate the descriptor pointer */ 231 | segment_descriptor_64 *descriptor = 232 | (segment_descriptor_64 *)(gdtr->base_address + (sel & IGNORE_RPL_MASK)); 233 | 234 | /* Fill in the entry information. */ 235 | entry->sel = sel; 236 | entry->limit = (descriptor->segment_limit_high << 16) | descriptor->segment_limit_low; 237 | entry->base = ((size_t)descriptor->base_address_high << 24) | 238 | ((size_t)descriptor->base_address_middle << 16) | 239 | ((size_t)descriptor->base_address_low); 240 | 241 | if (descriptor->descriptor_type == 0) { 242 | entry->base |= (size_t)descriptor->base_address_upper << 32; 243 | } 244 | 245 | /* Access rights are defines as the middle 16 bits of the descriptor flags section. */ 246 | entry->access.flags = (descriptor->flags >> 8) & 0xFFFF; 247 | entry->access.unusable = !descriptor->present; 248 | entry->access.reserved_1 = 0; 249 | } 250 | 251 | static void setup_vmcs_host(struct vmm_ctx *vmm, struct vcpu_ctx *vcpu) 252 | { 253 | /* Configure the host context, as we're hyperjacking we want 254 | * to clone the original context as much as possible for ease 255 | * of use. */ 256 | struct vcpu_context *guest_ctx = &vcpu->hyperjack_context; 257 | 258 | /* Write all of the selectors, ignoring the RPL for each field 259 | * as the host environment will always be ring-0. */ 260 | __vmwrite(VMCS_HOST_CS_SEL, guest_ctx->seg_cs & IGNORE_RPL_MASK); 261 | __vmwrite(VMCS_HOST_SS_SEL, guest_ctx->seg_ss & IGNORE_RPL_MASK); 262 | __vmwrite(VMCS_HOST_DS_SEL, guest_ctx->seg_ds & IGNORE_RPL_MASK); 263 | __vmwrite(VMCS_HOST_ES_SEL, guest_ctx->seg_es & IGNORE_RPL_MASK); 264 | __vmwrite(VMCS_HOST_FS_SEL, guest_ctx->seg_fs & IGNORE_RPL_MASK); 265 | __vmwrite(VMCS_HOST_GS_SEL, guest_ctx->seg_gs & IGNORE_RPL_MASK); 266 | 267 | /* As in a UEFI environment TR is not set, therefore we use our own 268 | * generated one when we modified the GDT. */ 269 | __vmwrite(VMCS_HOST_TR_SEL, vcpu->gdt_cfg.host_tr.flags & IGNORE_RPL_MASK); 270 | 271 | /* Now write all of the BASE registers that are used for the host. 
*/ 272 | __vmwrite(VMCS_HOST_GDTR_BASE, vcpu->gdt_cfg.host_gdtr.base_address); 273 | __vmwrite(VMCS_HOST_IDTR_BASE, vmm->init.host_idtr.base_address); 274 | 275 | /* 276 | * We (ab)use the GS_BASE field to store out vCPU context, so that 277 | * when we're in host context it's easy to retrieve which vCPU we are 278 | * via the GS_BASE field. 279 | */ 280 | __vmwrite(VMCS_HOST_GS_BASE, (uintptr_t)vcpu); 281 | 282 | /* Get the GDT information for FS & TR so we can write these for the host VMCS. */ 283 | struct gdt_entry entry; 284 | gather_gdt_entry(&vcpu->gdt_cfg.host_gdtr, guest_ctx->seg_fs, &entry); 285 | __vmwrite(VMCS_HOST_FS_BASE, entry.base); 286 | 287 | gather_gdt_entry(&vcpu->gdt_cfg.host_gdtr, vcpu->gdt_cfg.host_tr.flags, &entry); 288 | __vmwrite(VMCS_HOST_TR_BASE, entry.base); 289 | 290 | /* SYSENTRY fields. */ 291 | __vmwrite(VMCS_HOST_SYSENTER_ESP, rdmsr(IA32_SYSENTER_ESP)); 292 | __vmwrite(VMCS_HOST_SYSENTER_EIP, rdmsr(IA32_SYSENTER_EIP)); 293 | 294 | /* Control registers (using our own CR3 value for paging). */ 295 | __vmwrite(VMCS_HOST_CR0, vcpu->guest_ctrl_regs.reg_cr0.flags); 296 | __vmwrite(VMCS_HOST_CR3, vmm->init.host_cr3.flags); 297 | __vmwrite(VMCS_HOST_CR4, vcpu->guest_ctrl_regs.reg_cr4.flags); 298 | 299 | /* Extended feature enable registers. */ 300 | __vmwrite(VMCS_HOST_EFER, rdmsr(IA32_EFER)); 301 | 302 | /* 303 | * Load the hypervisor entrypoint and stack. We give ourselves a standard 304 | * size kernel stack (24KB) and bias for the context structure that the 305 | * hypervisor entrypoint will push on the stack, avoiding the need for RSP 306 | * modifying instructions in the entrypoint. Note that the CONTEXT pointer 307 | * and thus the stack itself, must be 16-byte aligned for ABI compatibility 308 | * with AMD64 -- specifically, XMM operations will fail otherwise, such as 309 | * the ones that __capture_context will perform. 310 | */ 311 | uintptr_t host_rip = (uintptr_t)shim_guest_to_host; 312 | uintptr_t host_rsp = (uintptr_t)vcpu + VMM_HYPERJACK_STACK_OFFSET; 313 | 314 | __vmwrite(VMCS_HOST_RSP, host_rsp); 315 | __vmwrite(VMCS_HOST_RIP, host_rip); 316 | DEBUG_PRINT("VMCS_HOST_RIP: 0x%lX VMCS_HOST_RSP: 0x%lX", host_rip, host_rsp); 317 | } 318 | 319 | __attribute__((noreturn)) static void vmm_hyperjack_handler(void) 320 | { 321 | /* 322 | * We are currently executing in the guest, after a successful 323 | * initial VMLAUNCH, so now we need to return to our hyperjacking 324 | * code path where the initial init_routine_per_vcpu::__capture_context 325 | * took place. 326 | * 327 | * This time with the launched flag set, therefore the driver 328 | * should then exit successfully. 329 | */ 330 | struct vcpu_ctx *vcpu = vmm_get_vcpu_ctx(); 331 | 332 | vcpu->launched = true; 333 | __restore_context(&vcpu->hyperjack_context); 334 | die_on(true, "Shouldn't be here context should have been restored."); 335 | } 336 | 337 | static void setup_vmcs_guest(struct vmm_ctx *vmm, struct vcpu_ctx *vcpu) 338 | { 339 | /* 340 | * Defines a generic structure so that we can iteratively write all segment 341 | * fields needed for the guest. 
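 * (This block-scope struct gdt_config describes a single VMCS segment write;
 * it shadows, and is unrelated to, the gdt_config that holds the vCPU's
 * GDT/TR state in vcpu->gdt_cfg.)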
342 | */ 343 | struct gdt_config { 344 | uint16_t sel; 345 | uint32_t vmcs_sel; 346 | uint32_t vmcs_lim; 347 | uint32_t vmcs_ar; 348 | uint32_t vmcs_base; 349 | segment_descriptor_register_64 *gdtr; 350 | }; 351 | 352 | struct vcpu_context *guest_ctx = &vcpu->hyperjack_context; 353 | 354 | const struct gdt_config vmcs_gdt_list[] = { 355 | { 356 | .sel = guest_ctx->seg_cs, 357 | .vmcs_sel = VMCS_GUEST_CS_SEL, 358 | .vmcs_lim = VMCS_GUEST_CS_LIMIT, 359 | .vmcs_ar = VMCS_GUEST_CS_ACCESS_RIGHTS, 360 | .vmcs_base = VMCS_GUEST_CS_BASE, 361 | .gdtr = &vcpu->gdt_cfg.host_gdtr, 362 | }, 363 | { 364 | .sel = guest_ctx->seg_ss, 365 | .vmcs_sel = VMCS_GUEST_SS_SEL, 366 | .vmcs_lim = VMCS_GUEST_SS_LIMIT, 367 | .vmcs_ar = VMCS_GUEST_SS_ACCESS_RIGHTS, 368 | .vmcs_base = VMCS_GUEST_SS_BASE, 369 | .gdtr = &vcpu->gdt_cfg.host_gdtr, 370 | }, 371 | { 372 | .sel = guest_ctx->seg_ds, 373 | .vmcs_sel = VMCS_GUEST_DS_SEL, 374 | .vmcs_lim = VMCS_GUEST_DS_LIMIT, 375 | .vmcs_ar = VMCS_GUEST_DS_ACCESS_RIGHTS, 376 | .vmcs_base = VMCS_GUEST_DS_BASE, 377 | .gdtr = &vcpu->gdt_cfg.host_gdtr, 378 | }, 379 | { 380 | .sel = guest_ctx->seg_es, 381 | .vmcs_sel = VMCS_GUEST_ES_SEL, 382 | .vmcs_lim = VMCS_GUEST_ES_LIMIT, 383 | .vmcs_ar = VMCS_GUEST_ES_ACCESS_RIGHTS, 384 | .vmcs_base = VMCS_GUEST_ES_BASE, 385 | .gdtr = &vcpu->gdt_cfg.host_gdtr, 386 | }, 387 | { 388 | .sel = guest_ctx->seg_fs, 389 | .vmcs_sel = VMCS_GUEST_FS_SEL, 390 | .vmcs_lim = VMCS_GUEST_FS_LIMIT, 391 | .vmcs_ar = VMCS_GUEST_FS_ACCESS_RIGHTS, 392 | .vmcs_base = VMCS_GUEST_FS_BASE, 393 | .gdtr = &vcpu->gdt_cfg.host_gdtr, 394 | }, 395 | { 396 | .sel = guest_ctx->seg_gs, 397 | .vmcs_sel = VMCS_GUEST_GS_SEL, 398 | .vmcs_lim = VMCS_GUEST_GS_LIMIT, 399 | .vmcs_ar = VMCS_GUEST_GS_ACCESS_RIGHTS, 400 | .vmcs_base = VMCS_GUEST_GS_BASE, 401 | .gdtr = &vcpu->gdt_cfg.host_gdtr, 402 | }, 403 | { 404 | .sel = vcpu->gdt_cfg.host_tr.flags, 405 | .vmcs_sel = VMCS_GUEST_TR_SEL, 406 | .vmcs_lim = VMCS_GUEST_TR_LIMIT, 407 | .vmcs_ar = VMCS_GUEST_TR_ACCESS_RIGHTS, 408 | .vmcs_base = VMCS_GUEST_TR_BASE, 409 | .gdtr = &vcpu->gdt_cfg.host_gdtr, 410 | }, 411 | { 412 | .sel = vcpu->gdt_cfg.guest_ldtr.flags, 413 | .vmcs_sel = VMCS_GUEST_LDTR_SEL, 414 | .vmcs_lim = VMCS_GUEST_LDTR_LIMIT, 415 | .vmcs_ar = VMCS_GUEST_LDTR_ACCESS_RIGHTS, 416 | .vmcs_base = VMCS_GUEST_LDTR_BASE, 417 | .gdtr = &vcpu->gdt_cfg.host_gdtr, 418 | } 419 | }; 420 | 421 | /* 422 | * For TR and LDTR we cannot use what the guest has, as to be able to 423 | * successfully VMENTER we need a TR and LDTR set (unfortunately) 424 | */ 425 | 426 | /* For each selector, generate it's entry and then fill in relevant fields. */ 427 | for (size_t i = 0; i < ARRAY_SIZE(vmcs_gdt_list); i++) { 428 | const struct gdt_config *curr_cfg = &vmcs_gdt_list[i]; 429 | struct gdt_entry entry; 430 | 431 | gather_gdt_entry(curr_cfg->gdtr, curr_cfg->sel, &entry); 432 | 433 | /* 434 | * This a shitty hack/workaround. 435 | * Some BIOS/UEFI environments will temporarily go into protected 436 | * mode on the AP's, if this is the case we need to ensure that 437 | * any operations that use the stack won't cause a #SS. Really we 438 | * should re-build the GDT entries upon re-entry to protected mode 439 | * but we can potentially do that in the future if we really need 440 | * to. 
441 | */ 442 | if (curr_cfg->vmcs_sel == VMCS_GUEST_SS_SEL) 443 | entry.limit = UINT32_MAX; 444 | 445 | if (curr_cfg->vmcs_sel) 446 | __vmwrite(curr_cfg->vmcs_sel, entry.sel); 447 | if (curr_cfg->vmcs_lim) 448 | __vmwrite(curr_cfg->vmcs_lim, entry.limit); 449 | if (curr_cfg->vmcs_ar) 450 | __vmwrite(curr_cfg->vmcs_ar, entry.access.flags); 451 | if (curr_cfg->vmcs_base) 452 | __vmwrite(curr_cfg->vmcs_base, entry.base); 453 | 454 | DEBUG_PRINT("VMX GDT Entry: %d\r\n" \ 455 | "--- VMCS SEL [0x%lX]: 0x%lX\r\n" \ 456 | "--- VMCS LIM [0x%lX]: 0x%lX\r\n" \ 457 | "--- VMCS AR [0x%lX]: 0x%lX\r\n" \ 458 | "--- VMCS BASE [0x%lX]: 0x%lX\r\n", 459 | i, 460 | curr_cfg->vmcs_sel, entry.sel, 461 | curr_cfg->vmcs_lim, entry.limit, 462 | curr_cfg->vmcs_ar, entry.access.flags, 463 | curr_cfg->vmcs_base, entry.base); 464 | } 465 | 466 | /* Now write the GDTR for the guest (due to TR & LDTR restrictions re-use guest). */ 467 | __vmwrite(VMCS_GUEST_GDTR_BASE, vcpu->gdt_cfg.guest_gdtr.base_address); 468 | __vmwrite(VMCS_GUEST_GDTR_LIMIT, vcpu->gdt_cfg.guest_gdtr.limit); 469 | 470 | /* Now write IDTR, we can ACTUALLY use the guest IDT thank god... */ 471 | __vmwrite(VMCS_GUEST_IDTR_BASE, vmm->init.guest_idtr.base_address); 472 | __vmwrite(VMCS_GUEST_IDTR_LIMIT, vmm->init.guest_idtr.limit); 473 | 474 | /* Control registers. */ 475 | __vmwrite(VMCS_CTRL_CR0_READ_SHADOW, vcpu->guest_ctrl_regs.reg_cr0.flags); 476 | __vmwrite(VMCS_GUEST_CR0, vcpu->guest_ctrl_regs.reg_cr0.flags); 477 | 478 | __vmwrite(VMCS_CTRL_CR3_TARGET_COUNT, 0); 479 | __vmwrite(VMCS_GUEST_CR3, vmm->init.guest_cr3.flags); 480 | 481 | /* 482 | * If a bit is set in the CR4 guest/host mask, this means that 483 | * the value from the CR4 shadow will be utilised when in guest 484 | * mode/non-root mode. 485 | * 486 | * As such here, we indicate the VMXE bit in CR4 is set to be intercepted 487 | * and therefore we then indicate that VMXE is not indicated by clearing 488 | * the VMXE bit in the CR4 read shadow. 489 | */ 490 | 491 | __vmwrite(VMCS_CTRL_CR4_MASK, CR4_VMXE_MASK); 492 | __vmwrite(VMCS_CTRL_CR4_READ_SHADOW, vcpu->guest_ctrl_regs.reg_cr4.flags & ~CR4_VMXE_MASK); 493 | __vmwrite(VMCS_GUEST_CR4, vcpu->guest_ctrl_regs.reg_cr4.flags); 494 | 495 | /* Debug kernel registers. */ 496 | __vmwrite(VMCS_GUEST_DEBUGCTL, vcpu->guest_ctrl_regs.debugctl.flags); 497 | __vmwrite(VMCS_GUEST_DR7, vcpu->guest_ctrl_regs.dr7); 498 | 499 | /* Extended feature enable registers. */ 500 | __vmwrite(VMCS_GUEST_EFER, rdmsr(IA32_EFER)); 501 | 502 | /* 503 | * We (ab)use the GS_BASE field to store out vCPU context, we do this 504 | * for guest context so that we can retrieve it in our hyperjack handler. 505 | * 506 | * This will eventually get overwritten/nulled when the guest OS boots 507 | * after anyway. 508 | */ 509 | __vmwrite(VMCS_GUEST_GS_BASE, (uintptr_t)vcpu); 510 | 511 | /* 512 | * Finally, load the guest stack, instruction pointer, and rflags, which 513 | * corresponds exactly to the location where __capture_context will return 514 | * to inside of init_routine_per_vcpu. 515 | * 516 | * Use a dirty hack where we set the RSP to the kernel stack and the address 517 | * then set the first parameter on the stack to point to the vCPU context 518 | * so our host_to_guest shim can retrieve this. This MUST be accessible 519 | * within the guest CR3 (we can't use our vmem from host). 520 | * so we use the physical address as we should beidentity mapped. 
521 | */ 522 | cr3 this_cr3; 523 | this_cr3.flags = __readcr3(); 524 | uintptr_t phys_vcpu_ctx = (uintptr_t)mem_va_to_pa(this_cr3, vcpu); 525 | 526 | uintptr_t guest_rip = (uintptr_t)vmm_hyperjack_handler; 527 | uintptr_t guest_rsp = (uintptr_t)phys_vcpu_ctx + VMM_HYPERJACK_STACK_OFFSET; 528 | 529 | __vmwrite(VMCS_GUEST_RFLAGS, guest_ctx->e_flags); 530 | __vmwrite(VMCS_GUEST_RSP, guest_rsp); 531 | __vmwrite(VMCS_GUEST_RIP, guest_rip); 532 | DEBUG_PRINT("VMCS_GUEST_RIP: 0x%lX VMCS_GUEST_RSP: 0x%lX PHYS_VCPU: 0x%lX", 533 | guest_rip, guest_rsp, phys_vcpu_ctx); 534 | } 535 | 536 | static void setup_vmcs_generic(struct vmm_ctx *vmm, struct vcpu_ctx *vcpu) 537 | { 538 | /* Set up the link pointer. */ 539 | __vmwrite(VMCS_GUEST_VMCS_LINK_PTR, ~0ull); 540 | 541 | /* Set up the EPT fields. */ 542 | __vmwrite(VMCS_CTRL_EPTP, ept_get_pointer(vmm->ept)->flags); 543 | __vmwrite(VMCS_CTRL_VPID, 1); 544 | 545 | /* Load the MSR bitmap with the bitmap which will be used to 546 | * indicate which MSR reads/writes to trap on. 547 | * Setting all bits indicates trap on read & write. 548 | * 549 | * NOTE: MSR trapping for EVERY read/write is very intensive 550 | * trying to boot an actual OS with this is terrible. Instead 551 | * in the future maybe we can use this to target MSR read/write 552 | * when a specific CR3 is loaded/stored so we can do targetted 553 | * reading of drivers etc. */ 554 | cr3 this_cr3; 555 | this_cr3.flags = __readcr3(); 556 | 557 | memset(vcpu->msr_trap_bitmap, 0x00, PAGE_SIZE); 558 | __vmwrite(VMCS_CTRL_MSR_BITMAP, mem_va_to_pa(this_cr3, vcpu->msr_trap_bitmap)); 559 | 560 | /* We don't explicitly enable any pin-based options ourselves, but there may 561 | * be some required by the procesor, the encode the MSR to include these. */ 562 | uint32_t encoded = encode_msr(rdmsr(IA32_VMX_TRUE_PINBASED_CTLS), 0); 563 | __vmwrite(VMCS_CTRL_PIN_EXEC, encoded); 564 | 565 | /* 566 | * Enable support for RDTSCP and XSAVES/XRESTORES in the guest. Windows 10 567 | * makes use of both of these instructions if the CPU supports it. By using 568 | * adjustMSR, these options will be ignored if this processor does 569 | * not actually support the instructions to begin with. 570 | * 571 | * Also enable EPT support, for additional performance and ability to trap 572 | * memory access efficiently. 573 | */ 574 | ia32_vmx_procbased_ctls2_register proc_ctls2 = { 0 }; 575 | proc_ctls2.enable_rdtscp = true; 576 | proc_ctls2.enable_invpcid = true; 577 | proc_ctls2.enable_xsaves = true; 578 | proc_ctls2.unrestricted_guest = true; 579 | proc_ctls2.enable_ept = true; 580 | proc_ctls2.enable_vpid = true; 581 | encoded = encode_msr(rdmsr(IA32_VMX_PROCBASED_CTLS2), proc_ctls2.flags); 582 | __vmwrite(VMCS_CTRL_PROC_EXEC2, encoded); 583 | 584 | /* In order for the proc_ctls2 & MSR bitmap to be used we need to explicitly 585 | * enable them. */ 586 | ia32_vmx_procbased_ctls_register proc_ctls = { 0 }; 587 | proc_ctls.use_msr_bitmaps = true; 588 | proc_ctls.activate_secondary_controls = true; 589 | encoded = encode_msr(rdmsr(IA32_VMX_TRUE_PROCBASED_CTLS), proc_ctls.flags); 590 | __vmwrite(VMCS_CTRL_PROC_EXEC, encoded); 591 | 592 | /* Make sure to exit in x64 mode at all times. 
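 * As with the other control fields, encode_msr() folds in the
 * IA32_VMX_TRUE_EXIT_CTLS constraints: any bit cleared in the MSR's high
 * dword is forced off and any bit set in its low dword is forced on, so
 * unsupported requests are silently dropped.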
*/ 593 | ia32_vmx_exit_ctls_register exit_ctls = { 0 }; 594 | exit_ctls.save_debug_controls = true; 595 | exit_ctls.save_ia32_efer = true; 596 | exit_ctls.host_address_space_size = true; 597 | exit_ctls.load_ia32_efer = true; 598 | encoded = encode_msr(rdmsr(IA32_VMX_TRUE_EXIT_CTLS), exit_ctls.flags); 599 | __vmwrite(VMCS_CTRL_EXIT, encoded); 600 | 601 | /* Make sure when we re-enter it's back in x64 mode too. */ 602 | ia32_vmx_entry_ctls_register entry_ctls = { 0 }; 603 | entry_ctls.load_debug_controls = true; 604 | entry_ctls.load_ia32_efer = true; 605 | entry_ctls.ia32e_mode_guest = true; 606 | encoded = encode_msr(rdmsr(IA32_VMX_TRUE_ENTRY_CTLS), entry_ctls.flags); 607 | __vmwrite(VMCS_CTRL_ENTRY, encoded); 608 | } 609 | 610 | static void hook_init_root_mode(struct vcpu_ctx *vcpu) 611 | { 612 | /* 613 | * A hook for where we're running in ROOT/HOST mode; 614 | * this can be used for initialising other application- 615 | * specific logic as required. 616 | */ 617 | (void)vcpu; 618 | 619 | /* We CANNOT enable the memory hider at this point. 620 | * Otherwise when we do a VMLAUNCH and hyperjack back 621 | * into our driver we'll get an EPT violation. 622 | * 623 | * So, we should add a VMCALL routine to enable the 624 | * hiding. This unfortunately will have to be triggered 625 | * by a separate module (plugin or UM process) as, due 626 | * to the reason mentioned above, we cannot hide our own 627 | * memory while we still need to execute from it in 628 | * guest mode. */ 629 | } 630 | 631 | static void __attribute__((ms_abi)) init_routine_per_vcpu(void *opaque) 632 | { 633 | struct vmm_ctx *vmm = (struct vmm_ctx *)opaque; 634 | 635 | /* Ensure that the correct host CR3 is loaded for this vCPU. 636 | * This SHOULD already be the case for vCPU 0 as it was set during 637 | * the initialisation of the modules; however, the other vCPUs 638 | * will still have whatever CR3 they had before the hyperjack. */ 639 | __writecr3(vmm->init.host_cr3.flags); 640 | 641 | size_t proc_idx; 642 | die_on(!impl_get_processor_index(&proc_idx), "Unable to retrieve processor index."); 643 | die_on(proc_idx >= VCPU_MAX, "vCPU index greater than supported by VMM."); 644 | 645 | /* Create the vCPU context structure. 646 | * THIS MUST BE ALLOCATED AS CONTIGUOUS PHYSICAL MEMORY AS WHEN EXITING 647 | * DURING THE HOST TO GUEST HYPERJACKING SHIM WE DON'T WANT NON-CONTIGUOUS 648 | * PMEM WHICH WOULD CAUSE A POTENTIAL OVERWRITE OF WRONG PHYSICAL MEMORY. */ 649 | struct vcpu_ctx *vcpu = (struct vcpu_ctx *)pmem_alloc_contiguous(sizeof(struct vcpu_ctx)); 650 | die_on(!vcpu, "Unable to allocate vCPU %ld context.", proc_idx); 651 | 652 | /* Set the pointer so we can retrieve the VMM context from the vCPU context. */ 653 | vcpu->vmm = vmm; 654 | vcpu->idx = proc_idx; 655 | 656 | DEBUG_PRINT("Initialising vCPU %ld vmm ctx 0x%lX vcpu ctx 0x%lX.", proc_idx, vmm, vcpu); 657 | 658 | /* Configure the host GDT. */ 659 | configure_vcpu_gdt(&vcpu->gdt_cfg); 660 | 661 | /* Capture the control registers & context for the vCPU, 662 | * i.e. what the guest should be restored to once hyperjacked. */ 663 | capture_control_regs(&vcpu->guest_ctrl_regs); 664 | __capture_context(&vcpu->hyperjack_context); 665 | 666 | /* On the first pass (before being hypervised) this shall be false as we 667 | * haven't hyperjacked yet. Upon restoration of the context 668 | * from within the guest (which lands us just after __capture_context) 669 | * we need to do nothing and effectively "complete" loading of the driver.
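 *
 * [Editor's note, illustrative and not part of the original source: the resume
 * side of this is presumably something along these lines, assuming a
 * __restore_context() counterpart to __capture_context() invoked from the
 * hyperjack/shim path once VMLAUNCH has succeeded:
 *
 *     vcpu->launched = true;
 *     __restore_context(&vcpu->hyperjack_context);
 *     // Execution reappears just after __capture_context() above with
 *     // vcpu->launched now true, so the block below is skipped and the
 *     // routine simply returns with the vCPU hypervised.
 * ]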
*/ 670 | if (!vcpu->launched) { 671 | enter_root_mode(vcpu); 672 | 673 | /* Set up VMCS */ 674 | setup_vmcs_generic(vmm, vcpu); 675 | setup_vmcs_host(vmm, vcpu); 676 | setup_vmcs_guest(vmm, vcpu); 677 | 678 | #ifdef CONFIG_NESTED 679 | nested_init_vcpu(vcpu); 680 | #endif 681 | 682 | /* Hook for when running in ROOT mode but we have not 683 | * yet launched back into non-root/GUEST. */ 684 | hook_init_root_mode(vcpu); 685 | 686 | /* Attempt VMLAUNCH. */ 687 | DEBUG_PRINT("Attempting VMLAUNCH on vCPU %ld with ctx: 0x%lX", proc_idx, vcpu); 688 | __vmlaunch(); 689 | 690 | /* 691 | * If we have got to this point, VMLAUNCH failed. 692 | * Get the failure reason and dump info for debugging. */ 693 | size_t fail_reason = __vmread(VMCS_VM_INSTR_ERROR); 694 | debug_print("Failed to launch VMX with reason: 0x%lX", fail_reason); 695 | while (1) {}; 696 | } 697 | } 698 | 699 | static bool event_has_error_code(exception_vector vector) 700 | { 701 | switch (vector) { 702 | case double_fault: 703 | case invalid_tss: 704 | case segment_not_present: 705 | case stack_segment_fault: 706 | case general_protection: 707 | case page_fault: 708 | case alignment_check: 709 | return true; 710 | default: 711 | return false; 712 | } 713 | } 714 | 715 | void vmm_init(struct vmm_init_params *params) 716 | { 717 | /* Make sure the CPU supports all of the features required. */ 718 | probe_capabilities(); 719 | 720 | /* Static allocation of the global VMM context. This has to be 721 | * static rather than dynamically allocated as the other vCPUs 722 | * that are started will not have the host CR3 set, hence they 723 | * will not have access to our dynamically allocated memory. */ 724 | static struct vmm_ctx vmm = { 0 }; 725 | memcpy(&vmm.init, params, sizeof(*params)); 726 | 727 | spin_init(&vmm.lock); 728 | vmm.ept = ept_init(); 729 | 730 | handler_init(&vmm); 731 | vmcall_init(&vmm); 732 | 733 | #ifdef CONFIG_NESTED 734 | nested_init(&vmm); 735 | #endif 736 | 737 | /* Run the initialisation routine on each LP. */ 738 | die_on(!impl_run_all_processors(init_routine_per_vcpu, &vmm), 739 | "Unable to run VMM init routine on each LP."); 740 | } 741 | 742 | void vmm_inject_guest_event(exception_vector vector, exception_error_code code) 743 | { 744 | vmentry_interrupt_info info = { 0 }; 745 | interruption_type type; 746 | 747 | /* Determine if the vector has an error code associated with it. */ 748 | info.deliver_error_code = event_has_error_code(vector); 749 | 750 | /* Determine the interrupt type.
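 *
 * [Editor's note, illustrative and not part of the original source: a typical use
 * of this helper from a VM-exit handler would be to reflect a fault back into the
 * guest, for example a #GP(0):
 *
 *     exception_error_code err = { 0 };
 *     vmm_inject_guest_event(general_protection, err);
 *
 * general_protection carries an error code, so deliver_error_code is set above and
 * the (zero) code is written to VMCS_CTRL_ENTRY_EXCEPTION_ERRCODE further down for
 * delivery on the next VM-entry.]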
*/ 751 | switch (vector) { 752 | case breakpoint: 753 | case overflow: 754 | type = software_exception; 755 | break; 756 | case debug: 757 | type = privileged_software_exception; 758 | break; 759 | default: 760 | type = hardware_exception; 761 | break; 762 | } 763 | 764 | /* Override if vector was greater than 0x20 */ 765 | if (vector >= 0x20) { 766 | type = external_interrupt; 767 | } 768 | 769 | info.vector = vector; 770 | info.interruption_type = type; 771 | info.valid = true; 772 | __vmwrite(VMCS_CTRL_ENTRY_INTERRUPTION_INFO, info.flags); 773 | 774 | if (info.deliver_error_code) 775 | __vmwrite(VMCS_CTRL_ENTRY_EXCEPTION_ERRCODE, code.flags); 776 | 777 | DEBUG_PRINT("Injected guest event 0x%lX type 0x%lX code 0x%lX", vector, type, code.flags); 778 | } -------------------------------------------------------------------------------- /examples/uefi/mp_service.h: -------------------------------------------------------------------------------- 1 | /** @file 2 | When installed, the MP Services Protocol produces a collection of services 3 | that are needed for MP management. 4 | 5 | The MP Services Protocol provides a generalized way of performing following tasks: 6 | - Retrieving information of multi-processor environment and MP-related status of 7 | specific processors. 8 | - Dispatching user-provided function to APs. 9 | - Maintain MP-related processor status. 10 | 11 | The MP Services Protocol must be produced on any system with more than one logical 12 | processor. 13 | 14 | The Protocol is available only during boot time. 15 | 16 | MP Services Protocol is hardware-independent. Most of the logic of this protocol 17 | is architecturally neutral. It abstracts the multi-processor environment and 18 | status of processors, and provides interfaces to retrieve information, maintain, 19 | and dispatch. 20 | 21 | MP Services Protocol may be consumed by ACPI module. The ACPI module may use this 22 | protocol to retrieve data that are needed for an MP platform and report them to OS. 23 | MP Services Protocol may also be used to program and configure processors, such 24 | as MTRR synchronization for memory space attributes setting in DXE Services. 25 | MP Services Protocol may be used by non-CPU DXE drivers to speed up platform boot 26 | by taking advantage of the processing capabilities of the APs, for example, using 27 | APs to help test system memory in parallel with other device initialization. 28 | Diagnostics applications may also use this protocol for multi-processor. 29 | 30 | Copyright (c) 2006 - 2017, Intel Corporation. All rights reserved.
31 | SPDX-License-Identifier: BSD-2-Clause-Patent 32 | 33 | @par Revision Reference: 34 | This Protocol is defined in the UEFI Platform Initialization Specification 1.2, 35 | Volume 2:Driver Execution Environment Core Interface. 36 | 37 | **/ 38 | 39 | #ifndef _MP_SERVICE_PROTOCOL_H_ 40 | #define _MP_SERVICE_PROTOCOL_H_ 41 | 42 | #include "efi.h" 43 | 44 | /* MODIFICATION: Added this manually from PI spec so I don't need to include other headers. */ 45 | typedef VOID (EFIAPI *EFI_AP_PROCEDURE) (IN VOID *ProcedureArgument); 46 | 47 | /// 48 | /// Global ID for the EFI_MP_SERVICES_PROTOCOL. 49 | /// 50 | #define EFI_MP_SERVICES_PROTOCOL_GUID \ 51 | { \ 52 | 0x3fdda605, 0xa76e, 0x4f46, {0xad, 0x29, 0x12, 0xf4, 0x53, 0x1b, 0x3d, 0x08} \ 53 | } 54 | 55 | /// 56 | /// Value used in the NumberProcessors parameter of the GetProcessorInfo function 57 | /// 58 | #define CPU_V2_EXTENDED_TOPOLOGY BIT24 59 | 60 | /// 61 | /// Forward declaration for the EFI_MP_SERVICES_PROTOCOL. 62 | /// 63 | typedef struct _EFI_MP_SERVICES_PROTOCOL EFI_MP_SERVICES_PROTOCOL; 64 | 65 | /// 66 | /// Terminator for a list of failed CPUs returned by StartAllAPs(). 67 | /// 68 | #define END_OF_CPU_LIST 0xffffffff 69 | 70 | /// 71 | /// This bit is used in the StatusFlag field of EFI_PROCESSOR_INFORMATION and 72 | /// indicates whether the processor is playing the role of BSP. If the bit is 1, 73 | /// then the processor is BSP. Otherwise, it is AP. 74 | /// 75 | #define PROCESSOR_AS_BSP_BIT 0x00000001 76 | 77 | /// 78 | /// This bit is used in the StatusFlag field of EFI_PROCESSOR_INFORMATION and 79 | /// indicates whether the processor is enabled. If the bit is 1, then the 80 | /// processor is enabled. Otherwise, it is disabled. 81 | /// 82 | #define PROCESSOR_ENABLED_BIT 0x00000002 83 | 84 | /// 85 | /// This bit is used in the StatusFlag field of EFI_PROCESSOR_INFORMATION and 86 | /// indicates whether the processor is healthy. If the bit is 1, then the 87 | /// processor is healthy. Otherwise, some fault has been detected for the processor. 88 | /// 89 | #define PROCESSOR_HEALTH_STATUS_BIT 0x00000004 90 | 91 | /// 92 | /// Structure that describes the pyhiscal location of a logical CPU. 93 | /// 94 | typedef struct { 95 | /// 96 | /// Zero-based physical package number that identifies the cartridge of the processor. 97 | /// 98 | UINT32 Package; 99 | /// 100 | /// Zero-based physical core number within package of the processor. 101 | /// 102 | UINT32 Core; 103 | /// 104 | /// Zero-based logical thread number within core of the processor. 105 | /// 106 | UINT32 Thread; 107 | } EFI_CPU_PHYSICAL_LOCATION; 108 | 109 | /// 110 | /// Structure that defines the 6-level physical location of the processor 111 | /// 112 | typedef struct { 113 | /// 114 | /// Package Zero-based physical package number that identifies the cartridge of the processor. 115 | /// 116 | UINT32 Package; 117 | /// 118 | /// Module Zero-based physical module number within package of the processor. 119 | /// 120 | UINT32 Module; 121 | /// 122 | /// Tile Zero-based physical tile number within module of the processor. 123 | /// 124 | UINT32 Tile; 125 | /// 126 | /// Die Zero-based physical die number within tile of the processor. 127 | /// 128 | UINT32 Die; 129 | /// 130 | /// Core Zero-based physical core number within die of the processor. 131 | /// 132 | UINT32 Core; 133 | /// 134 | /// Thread Zero-based logical thread number within core of the processor. 
135 | /// 136 | UINT32 Thread; 137 | } EFI_CPU_PHYSICAL_LOCATION2; 138 | 139 | typedef union { 140 | /// The 6-level physical location of the processor, including the 141 | /// physical package number that identifies the cartridge, the physical 142 | /// module number within package, the physical tile number within the module, 143 | /// the physical die number within the tile, the physical core number within 144 | /// package, and logical thread number within core. 145 | EFI_CPU_PHYSICAL_LOCATION2 Location2; 146 | } EXTENDED_PROCESSOR_INFORMATION; 147 | 148 | /// 149 | /// Structure that describes information about a logical CPU. 150 | /// 151 | typedef struct { 152 | /// 153 | /// The unique processor ID determined by system hardware. For IA32 and X64, 154 | /// the processor ID is the same as the Local APIC ID. Only the lower 8 bits 155 | /// are used, and higher bits are reserved. For IPF, the lower 16 bits contains 156 | /// id/eid, and higher bits are reserved. 157 | /// 158 | UINT64 ProcessorId; 159 | /// 160 | /// Flags indicating if the processor is BSP or AP, if the processor is enabled 161 | /// or disabled, and if the processor is healthy. Bits 3..31 are reserved and 162 | /// must be 0. 163 | /// 164 | ///
165 |   /// BSP  ENABLED  HEALTH  Description
166 |   /// ===  =======  ======  ===================================================
167 |   ///  0      0       0     Unhealthy Disabled AP.
168 |   ///  0      0       1     Healthy Disabled AP.
169 |   ///  0      1       0     Unhealthy Enabled AP.
170 |   ///  0      1       1     Healthy Enabled AP.
171 |   ///  1      0       0     Invalid. The BSP can never be in the disabled state.
172 |   ///  1      0       1     Invalid. The BSP can never be in the disabled state.
173 |   ///  1      1       0     Unhealthy Enabled BSP.
174 |   ///  1      1       1     Healthy Enabled BSP.
175 |   /// 
176 | /// 177 | UINT32 StatusFlag; 178 | /// 179 | /// The physical location of the processor, including the physical package number 180 | /// that identifies the cartridge, the physical core number within package, and 181 | /// logical thread number within core. 182 | /// 183 | EFI_CPU_PHYSICAL_LOCATION Location; 184 | /// 185 | /// The extended information of the processor. This field is filled only when 186 | /// CPU_V2_EXTENDED_TOPOLOGY is set in parameter ProcessorNumber. 187 | EXTENDED_PROCESSOR_INFORMATION ExtendedInformation; 188 | } EFI_PROCESSOR_INFORMATION; 189 | 190 | /** 191 | This service retrieves the number of logical processor in the platform 192 | and the number of those logical processors that are enabled on this boot. 193 | This service may only be called from the BSP. 194 | 195 | This function is used to retrieve the following information: 196 | - The number of logical processors that are present in the system. 197 | - The number of enabled logical processors in the system at the instant 198 | this call is made. 199 | 200 | Because MP Service Protocol provides services to enable and disable processors 201 | dynamically, the number of enabled logical processors may vary during the 202 | course of a boot session. 203 | 204 | If this service is called from an AP, then EFI_DEVICE_ERROR is returned. 205 | If NumberOfProcessors or NumberOfEnabledProcessors is NULL, then 206 | EFI_INVALID_PARAMETER is returned. Otherwise, the total number of processors 207 | is returned in NumberOfProcessors, the number of currently enabled processor 208 | is returned in NumberOfEnabledProcessors, and EFI_SUCCESS is returned. 209 | 210 | @param[in] This A pointer to the EFI_MP_SERVICES_PROTOCOL 211 | instance. 212 | @param[out] NumberOfProcessors Pointer to the total number of logical 213 | processors in the system, including the BSP 214 | and disabled APs. 215 | @param[out] NumberOfEnabledProcessors Pointer to the number of enabled logical 216 | processors that exist in system, including 217 | the BSP. 218 | 219 | @retval EFI_SUCCESS The number of logical processors and enabled 220 | logical processors was retrieved. 221 | @retval EFI_DEVICE_ERROR The calling processor is an AP. 222 | @retval EFI_INVALID_PARAMETER NumberOfProcessors is NULL. 223 | @retval EFI_INVALID_PARAMETER NumberOfEnabledProcessors is NULL. 224 | 225 | **/ 226 | typedef 227 | EFI_STATUS 228 | (EFIAPI *EFI_MP_SERVICES_GET_NUMBER_OF_PROCESSORS)( 229 | IN EFI_MP_SERVICES_PROTOCOL *This, 230 | OUT UINTN *NumberOfProcessors, 231 | OUT UINTN *NumberOfEnabledProcessors 232 | ); 233 | 234 | /** 235 | Gets detailed MP-related information on the requested processor at the 236 | instant this call is made. This service may only be called from the BSP. 237 | 238 | This service retrieves detailed MP-related information about any processor 239 | on the platform. Note the following: 240 | - The processor information may change during the course of a boot session. 241 | - The information presented here is entirely MP related. 242 | 243 | Information regarding the number of caches and their sizes, frequency of operation, 244 | slot numbers is all considered platform-related information and is not provided 245 | by this service. 246 | 247 | @param[in] This A pointer to the EFI_MP_SERVICES_PROTOCOL 248 | instance. 249 | @param[in] ProcessorNumber The handle number of processor. 250 | @param[out] ProcessorInfoBuffer A pointer to the buffer where information for 251 | the requested processor is deposited. 
252 | 253 | @retval EFI_SUCCESS Processor information was returned. 254 | @retval EFI_DEVICE_ERROR The calling processor is an AP. 255 | @retval EFI_INVALID_PARAMETER ProcessorInfoBuffer is NULL. 256 | @retval EFI_NOT_FOUND The processor with the handle specified by 257 | ProcessorNumber does not exist in the platform. 258 | 259 | **/ 260 | typedef 261 | EFI_STATUS 262 | (EFIAPI *EFI_MP_SERVICES_GET_PROCESSOR_INFO)( 263 | IN EFI_MP_SERVICES_PROTOCOL *This, 264 | IN UINTN ProcessorNumber, 265 | OUT EFI_PROCESSOR_INFORMATION *ProcessorInfoBuffer 266 | ); 267 | 268 | /** 269 | This service executes a caller provided function on all enabled APs. APs can 270 | run either simultaneously or one at a time in sequence. This service supports 271 | both blocking and non-blocking requests. The non-blocking requests use EFI 272 | events so the BSP can detect when the APs have finished. This service may only 273 | be called from the BSP. 274 | 275 | This function is used to dispatch all the enabled APs to the function specified 276 | by Procedure. If any enabled AP is busy, then EFI_NOT_READY is returned 277 | immediately and Procedure is not started on any AP. 278 | 279 | If SingleThread is TRUE, all the enabled APs execute the function specified by 280 | Procedure one by one, in ascending order of processor handle number. Otherwise, 281 | all the enabled APs execute the function specified by Procedure simultaneously. 282 | 283 | If WaitEvent is NULL, execution is in blocking mode. The BSP waits until all 284 | APs finish or TimeoutInMicroSecs expires. Otherwise, execution is in non-blocking 285 | mode, and the BSP returns from this service without waiting for APs. If a 286 | non-blocking mode is requested after the UEFI Event EFI_EVENT_GROUP_READY_TO_BOOT 287 | is signaled, then EFI_UNSUPPORTED must be returned. 288 | 289 | If the timeout specified by TimeoutInMicroseconds expires before all APs return 290 | from Procedure, then Procedure on the failed APs is terminated. All enabled APs 291 | are always available for further calls to EFI_MP_SERVICES_PROTOCOL.StartupAllAPs() 292 | and EFI_MP_SERVICES_PROTOCOL.StartupThisAP(). If FailedCpuList is not NULL, its 293 | content points to the list of processor handle numbers in which Procedure was 294 | terminated. 295 | 296 | Note: It is the responsibility of the consumer of the EFI_MP_SERVICES_PROTOCOL.StartupAllAPs() 297 | to make sure that the nature of the code that is executed on the BSP and the 298 | dispatched APs is well controlled. The MP Services Protocol does not guarantee 299 | that the Procedure function is MP-safe. Hence, the tasks that can be run in 300 | parallel are limited to certain independent tasks and well-controlled exclusive 301 | code. EFI services and protocols may not be called by APs unless otherwise 302 | specified. 303 | 304 | In blocking execution mode, BSP waits until all APs finish or 305 | TimeoutInMicroSeconds expires. 306 | 307 | In non-blocking execution mode, BSP is freed to return to the caller and then 308 | proceed to the next task without having to wait for APs. The following 309 | sequence needs to occur in a non-blocking execution mode: 310 | 311 | -# The caller that intends to use this MP Services Protocol in non-blocking 312 | mode creates WaitEvent by calling the EFI CreateEvent() service. The caller 313 | invokes EFI_MP_SERVICES_PROTOCOL.StartupAllAPs(). If the parameter WaitEvent 314 | is not NULL, then StartupAllAPs() executes in non-blocking mode. 
It requests 315 | the function specified by Procedure to be started on all the enabled APs, 316 | and releases the BSP to continue with other tasks. 317 | -# The caller can use the CheckEvent() and WaitForEvent() services to check 318 | the state of the WaitEvent created in step 1. 319 | -# When the APs complete their task or TimeoutInMicroSecondss expires, the MP 320 | Service signals WaitEvent by calling the EFI SignalEvent() function. If 321 | FailedCpuList is not NULL, its content is available when WaitEvent is 322 | signaled. If all APs returned from Procedure prior to the timeout, then 323 | FailedCpuList is set to NULL. If not all APs return from Procedure before 324 | the timeout, then FailedCpuList is filled in with the list of the failed 325 | APs. The buffer is allocated by MP Service Protocol using AllocatePool(). 326 | It is the caller's responsibility to free the buffer with FreePool() service. 327 | -# This invocation of SignalEvent() function informs the caller that invoked 328 | EFI_MP_SERVICES_PROTOCOL.StartupAllAPs() that either all the APs completed 329 | the specified task or a timeout occurred. The contents of FailedCpuList 330 | can be examined to determine which APs did not complete the specified task 331 | prior to the timeout. 332 | 333 | @param[in] This A pointer to the EFI_MP_SERVICES_PROTOCOL 334 | instance. 335 | @param[in] Procedure A pointer to the function to be run on 336 | enabled APs of the system. See type 337 | EFI_AP_PROCEDURE. 338 | @param[in] SingleThread If TRUE, then all the enabled APs execute 339 | the function specified by Procedure one by 340 | one, in ascending order of processor handle 341 | number. If FALSE, then all the enabled APs 342 | execute the function specified by Procedure 343 | simultaneously. 344 | @param[in] WaitEvent The event created by the caller with CreateEvent() 345 | service. If it is NULL, then execute in 346 | blocking mode. BSP waits until all APs finish 347 | or TimeoutInMicroSeconds expires. If it's 348 | not NULL, then execute in non-blocking mode. 349 | BSP requests the function specified by 350 | Procedure to be started on all the enabled 351 | APs, and go on executing immediately. If 352 | all return from Procedure, or TimeoutInMicroSeconds 353 | expires, this event is signaled. The BSP 354 | can use the CheckEvent() or WaitForEvent() 355 | services to check the state of event. Type 356 | EFI_EVENT is defined in CreateEvent() in 357 | the Unified Extensible Firmware Interface 358 | Specification. 359 | @param[in] TimeoutInMicrosecsond Indicates the time limit in microseconds for 360 | APs to return from Procedure, either for 361 | blocking or non-blocking mode. Zero means 362 | infinity. If the timeout expires before 363 | all APs return from Procedure, then Procedure 364 | on the failed APs is terminated. All enabled 365 | APs are available for next function assigned 366 | by EFI_MP_SERVICES_PROTOCOL.StartupAllAPs() 367 | or EFI_MP_SERVICES_PROTOCOL.StartupThisAP(). 368 | If the timeout expires in blocking mode, 369 | BSP returns EFI_TIMEOUT. If the timeout 370 | expires in non-blocking mode, WaitEvent 371 | is signaled with SignalEvent(). 372 | @param[in] ProcedureArgument The parameter passed into Procedure for 373 | all APs. 374 | @param[out] FailedCpuList If NULL, this parameter is ignored. Otherwise, 375 | if all APs finish successfully, then its 376 | content is set to NULL. 
If not all APs 377 | finish before timeout expires, then its 378 | content is set to address of the buffer 379 | holding handle numbers of the failed APs. 380 | The buffer is allocated by MP Service Protocol, 381 | and it's the caller's responsibility to 382 | free the buffer with FreePool() service. 383 | In blocking mode, it is ready for consumption 384 | when the call returns. In non-blocking mode, 385 | it is ready when WaitEvent is signaled. The 386 | list of failed CPU is terminated by 387 | END_OF_CPU_LIST. 388 | 389 | @retval EFI_SUCCESS In blocking mode, all APs have finished before 390 | the timeout expired. 391 | @retval EFI_SUCCESS In non-blocking mode, function has been dispatched 392 | to all enabled APs. 393 | @retval EFI_UNSUPPORTED A non-blocking mode request was made after the 394 | UEFI event EFI_EVENT_GROUP_READY_TO_BOOT was 395 | signaled. 396 | @retval EFI_DEVICE_ERROR Caller processor is AP. 397 | @retval EFI_NOT_STARTED No enabled APs exist in the system. 398 | @retval EFI_NOT_READY Any enabled APs are busy. 399 | @retval EFI_TIMEOUT In blocking mode, the timeout expired before 400 | all enabled APs have finished. 401 | @retval EFI_INVALID_PARAMETER Procedure is NULL. 402 | 403 | **/ 404 | typedef 405 | EFI_STATUS 406 | (EFIAPI *EFI_MP_SERVICES_STARTUP_ALL_APS)( 407 | IN EFI_MP_SERVICES_PROTOCOL *This, 408 | IN EFI_AP_PROCEDURE Procedure, 409 | IN BOOLEAN SingleThread, 410 | IN EFI_EVENT WaitEvent OPTIONAL, 411 | IN UINTN TimeoutInMicroSeconds, 412 | IN VOID *ProcedureArgument OPTIONAL, 413 | OUT UINTN **FailedCpuList OPTIONAL 414 | ); 415 | 416 | /** 417 | This service lets the caller get one enabled AP to execute a caller-provided 418 | function. The caller can request the BSP to either wait for the completion 419 | of the AP or just proceed with the next task by using the EFI event mechanism. 420 | See EFI_MP_SERVICES_PROTOCOL.StartupAllAPs() for more details on non-blocking 421 | execution support. This service may only be called from the BSP. 422 | 423 | This function is used to dispatch one enabled AP to the function specified by 424 | Procedure passing in the argument specified by ProcedureArgument. If WaitEvent 425 | is NULL, execution is in blocking mode. The BSP waits until the AP finishes or 426 | TimeoutInMicroSecondss expires. Otherwise, execution is in non-blocking mode. 427 | BSP proceeds to the next task without waiting for the AP. If a non-blocking mode 428 | is requested after the UEFI Event EFI_EVENT_GROUP_READY_TO_BOOT is signaled, 429 | then EFI_UNSUPPORTED must be returned. 430 | 431 | If the timeout specified by TimeoutInMicroseconds expires before the AP returns 432 | from Procedure, then execution of Procedure by the AP is terminated. The AP is 433 | available for subsequent calls to EFI_MP_SERVICES_PROTOCOL.StartupAllAPs() and 434 | EFI_MP_SERVICES_PROTOCOL.StartupThisAP(). 435 | 436 | @param[in] This A pointer to the EFI_MP_SERVICES_PROTOCOL 437 | instance. 438 | @param[in] Procedure A pointer to the function to be run on the 439 | designated AP of the system. See type 440 | EFI_AP_PROCEDURE. 441 | @param[in] ProcessorNumber The handle number of the AP. The range is 442 | from 0 to the total number of logical 443 | processors minus 1. The total number of 444 | logical processors can be retrieved by 445 | EFI_MP_SERVICES_PROTOCOL.GetNumberOfProcessors(). 446 | @param[in] WaitEvent The event created by the caller with CreateEvent() 447 | service. If it is NULL, then execute in 448 | blocking mode. 
BSP waits until this AP finish 449 | or TimeoutInMicroSeconds expires. If it's 450 | not NULL, then execute in non-blocking mode. 451 | BSP requests the function specified by 452 | Procedure to be started on this AP, 453 | and go on executing immediately. If this AP 454 | return from Procedure or TimeoutInMicroSeconds 455 | expires, this event is signaled. The BSP 456 | can use the CheckEvent() or WaitForEvent() 457 | services to check the state of event. Type 458 | EFI_EVENT is defined in CreateEvent() in 459 | the Unified Extensible Firmware Interface 460 | Specification. 461 | @param[in] TimeoutInMicrosecsond Indicates the time limit in microseconds for 462 | this AP to finish this Procedure, either for 463 | blocking or non-blocking mode. Zero means 464 | infinity. If the timeout expires before 465 | this AP returns from Procedure, then Procedure 466 | on the AP is terminated. The 467 | AP is available for next function assigned 468 | by EFI_MP_SERVICES_PROTOCOL.StartupAllAPs() 469 | or EFI_MP_SERVICES_PROTOCOL.StartupThisAP(). 470 | If the timeout expires in blocking mode, 471 | BSP returns EFI_TIMEOUT. If the timeout 472 | expires in non-blocking mode, WaitEvent 473 | is signaled with SignalEvent(). 474 | @param[in] ProcedureArgument The parameter passed into Procedure on the 475 | specified AP. 476 | @param[out] Finished If NULL, this parameter is ignored. In 477 | blocking mode, this parameter is ignored. 478 | In non-blocking mode, if AP returns from 479 | Procedure before the timeout expires, its 480 | content is set to TRUE. Otherwise, the 481 | value is set to FALSE. The caller can 482 | determine if the AP returned from Procedure 483 | by evaluating this value. 484 | 485 | @retval EFI_SUCCESS In blocking mode, specified AP finished before 486 | the timeout expires. 487 | @retval EFI_SUCCESS In non-blocking mode, the function has been 488 | dispatched to specified AP. 489 | @retval EFI_UNSUPPORTED A non-blocking mode request was made after the 490 | UEFI event EFI_EVENT_GROUP_READY_TO_BOOT was 491 | signaled. 492 | @retval EFI_DEVICE_ERROR The calling processor is an AP. 493 | @retval EFI_TIMEOUT In blocking mode, the timeout expired before 494 | the specified AP has finished. 495 | @retval EFI_NOT_READY The specified AP is busy. 496 | @retval EFI_NOT_FOUND The processor with the handle specified by 497 | ProcessorNumber does not exist. 498 | @retval EFI_INVALID_PARAMETER ProcessorNumber specifies the BSP or disabled AP. 499 | @retval EFI_INVALID_PARAMETER Procedure is NULL. 500 | 501 | **/ 502 | typedef 503 | EFI_STATUS 504 | (EFIAPI *EFI_MP_SERVICES_STARTUP_THIS_AP)( 505 | IN EFI_MP_SERVICES_PROTOCOL *This, 506 | IN EFI_AP_PROCEDURE Procedure, 507 | IN UINTN ProcessorNumber, 508 | IN EFI_EVENT WaitEvent OPTIONAL, 509 | IN UINTN TimeoutInMicroseconds, 510 | IN VOID *ProcedureArgument OPTIONAL, 511 | OUT BOOLEAN *Finished OPTIONAL 512 | ); 513 | 514 | /** 515 | This service switches the requested AP to be the BSP from that point onward. 516 | This service changes the BSP for all purposes. This call can only be performed 517 | by the current BSP. 518 | 519 | This service switches the requested AP to be the BSP from that point onward. 520 | This service changes the BSP for all purposes. The new BSP can take over the 521 | execution of the old BSP and continue seamlessly from where the old one left 522 | off. This service may not be supported after the UEFI Event EFI_EVENT_GROUP_READY_TO_BOOT 523 | is signaled. 
524 | 525 | If the BSP cannot be switched prior to the return from this service, then 526 | EFI_UNSUPPORTED must be returned. 527 | 528 | @param[in] This A pointer to the EFI_MP_SERVICES_PROTOCOL instance. 529 | @param[in] ProcessorNumber The handle number of AP that is to become the new 530 | BSP. The range is from 0 to the total number of 531 | logical processors minus 1. The total number of 532 | logical processors can be retrieved by 533 | EFI_MP_SERVICES_PROTOCOL.GetNumberOfProcessors(). 534 | @param[in] EnableOldBSP If TRUE, then the old BSP will be listed as an 535 | enabled AP. Otherwise, it will be disabled. 536 | 537 | @retval EFI_SUCCESS BSP successfully switched. 538 | @retval EFI_UNSUPPORTED Switching the BSP cannot be completed prior to 539 | this service returning. 540 | @retval EFI_UNSUPPORTED Switching the BSP is not supported. 541 | @retval EFI_DEVICE_ERROR The calling processor is an AP. 542 | @retval EFI_NOT_FOUND The processor with the handle specified by 543 | ProcessorNumber does not exist. 544 | @retval EFI_INVALID_PARAMETER ProcessorNumber specifies the current BSP or 545 | a disabled AP. 546 | @retval EFI_NOT_READY The specified AP is busy. 547 | 548 | **/ 549 | typedef 550 | EFI_STATUS 551 | (EFIAPI *EFI_MP_SERVICES_SWITCH_BSP)( 552 | IN EFI_MP_SERVICES_PROTOCOL *This, 553 | IN UINTN ProcessorNumber, 554 | IN BOOLEAN EnableOldBSP 555 | ); 556 | 557 | /** 558 | This service lets the caller enable or disable an AP from this point onward. 559 | This service may only be called from the BSP. 560 | 561 | This service allows the caller enable or disable an AP from this point onward. 562 | The caller can optionally specify the health status of the AP by Health. If 563 | an AP is being disabled, then the state of the disabled AP is implementation 564 | dependent. If an AP is enabled, then the implementation must guarantee that a 565 | complete initialization sequence is performed on the AP, so the AP is in a state 566 | that is compatible with an MP operating system. This service may not be supported 567 | after the UEFI Event EFI_EVENT_GROUP_READY_TO_BOOT is signaled. 568 | 569 | If the enable or disable AP operation cannot be completed prior to the return 570 | from this service, then EFI_UNSUPPORTED must be returned. 571 | 572 | @param[in] This A pointer to the EFI_MP_SERVICES_PROTOCOL instance. 573 | @param[in] ProcessorNumber The handle number of AP. 574 | The range is from 0 to the total number of 575 | logical processors minus 1. The total number of 576 | logical processors can be retrieved by 577 | EFI_MP_SERVICES_PROTOCOL.GetNumberOfProcessors(). 578 | @param[in] EnableAP Specifies the new state for the processor for 579 | enabled, FALSE for disabled. 580 | @param[in] HealthFlag If not NULL, a pointer to a value that specifies 581 | the new health status of the AP. This flag 582 | corresponds to StatusFlag defined in 583 | EFI_MP_SERVICES_PROTOCOL.GetProcessorInfo(). Only 584 | the PROCESSOR_HEALTH_STATUS_BIT is used. All other 585 | bits are ignored. If it is NULL, this parameter 586 | is ignored. 587 | 588 | @retval EFI_SUCCESS The specified AP was enabled or disabled successfully. 589 | @retval EFI_UNSUPPORTED Enabling or disabling an AP cannot be completed 590 | prior to this service returning. 591 | @retval EFI_UNSUPPORTED Enabling or disabling an AP is not supported. 592 | @retval EFI_DEVICE_ERROR The calling processor is an AP. 593 | @retval EFI_NOT_FOUND Processor with the handle specified by ProcessorNumber 594 | does not exist. 
595 | @retval EFI_INVALID_PARAMETER ProcessorNumber specifies the BSP. 596 | 597 | **/ 598 | typedef 599 | EFI_STATUS 600 | (EFIAPI *EFI_MP_SERVICES_ENABLEDISABLEAP)( 601 | IN EFI_MP_SERVICES_PROTOCOL *This, 602 | IN UINTN ProcessorNumber, 603 | IN BOOLEAN EnableAP, 604 | IN UINT32 *HealthFlag OPTIONAL 605 | ); 606 | 607 | /** 608 | This return the handle number for the calling processor. This service may be 609 | called from the BSP and APs. 610 | 611 | This service returns the processor handle number for the calling processor. 612 | The returned value is in the range from 0 to the total number of logical 613 | processors minus 1. The total number of logical processors can be retrieved 614 | with EFI_MP_SERVICES_PROTOCOL.GetNumberOfProcessors(). This service may be 615 | called from the BSP and APs. If ProcessorNumber is NULL, then EFI_INVALID_PARAMETER 616 | is returned. Otherwise, the current processors handle number is returned in 617 | ProcessorNumber, and EFI_SUCCESS is returned. 618 | 619 | @param[in] This A pointer to the EFI_MP_SERVICES_PROTOCOL instance. 620 | @param[in] ProcessorNumber Pointer to the handle number of AP. 621 | The range is from 0 to the total number of 622 | logical processors minus 1. The total number of 623 | logical processors can be retrieved by 624 | EFI_MP_SERVICES_PROTOCOL.GetNumberOfProcessors(). 625 | 626 | @retval EFI_SUCCESS The current processor handle number was returned 627 | in ProcessorNumber. 628 | @retval EFI_INVALID_PARAMETER ProcessorNumber is NULL. 629 | 630 | **/ 631 | typedef 632 | EFI_STATUS 633 | (EFIAPI *EFI_MP_SERVICES_WHOAMI)( 634 | IN EFI_MP_SERVICES_PROTOCOL *This, 635 | OUT UINTN *ProcessorNumber 636 | ); 637 | 638 | /// 639 | /// When installed, the MP Services Protocol produces a collection of services 640 | /// that are needed for MP management. 641 | /// 642 | /// Before the UEFI event EFI_EVENT_GROUP_READY_TO_BOOT is signaled, the module 643 | /// that produces this protocol is required to place all APs into an idle state 644 | /// whenever the APs are disabled or the APs are not executing code as requested 645 | /// through the StartupAllAPs() or StartupThisAP() services. The idle state of 646 | /// an AP before the UEFI event EFI_EVENT_GROUP_READY_TO_BOOT is signaled is 647 | /// implementation dependent. 648 | /// 649 | /// After the UEFI event EFI_EVENT_GROUP_READY_TO_BOOT is signaled, all the APs 650 | /// must be placed in the OS compatible CPU state as defined by the UEFI 651 | /// Specification. Implementations of this protocol may use the UEFI event 652 | /// EFI_EVENT_GROUP_READY_TO_BOOT to force APs into the OS compatible state as 653 | /// defined by the UEFI Specification. Modules that use this protocol must 654 | /// guarantee that all non-blocking mode requests on all APs have been completed 655 | /// before the UEFI event EFI_EVENT_GROUP_READY_TO_BOOT is signaled. Since the 656 | /// order that event notification functions in the same event group are executed 657 | /// is not deterministic, an event of type EFI_EVENT_GROUP_READY_TO_BOOT cannot 658 | /// be used to guarantee that APs have completed their non-blocking mode requests. 659 | /// 660 | /// When the UEFI event EFI_EVENT_GROUP_READY_TO_BOOT is signaled, the StartAllAPs() 661 | /// and StartupThisAp() services must no longer support non-blocking mode requests. 662 | /// The support for SwitchBSP() and EnableDisableAP() may no longer be supported 663 | /// after this event is signaled. 
Since UEFI Applications and UEFI OS Loaders 664 | /// execute after the UEFI event EFI_EVENT_GROUP_READY_TO_BOOT is signaled, these 665 | /// UEFI images must be aware that the functionality of this protocol may be reduced. 666 | /// 667 | struct _EFI_MP_SERVICES_PROTOCOL { 668 | EFI_MP_SERVICES_GET_NUMBER_OF_PROCESSORS GetNumberOfProcessors; 669 | EFI_MP_SERVICES_GET_PROCESSOR_INFO GetProcessorInfo; 670 | EFI_MP_SERVICES_STARTUP_ALL_APS StartupAllAPs; 671 | EFI_MP_SERVICES_STARTUP_THIS_AP StartupThisAP; 672 | EFI_MP_SERVICES_SWITCH_BSP SwitchBSP; 673 | EFI_MP_SERVICES_ENABLEDISABLEAP EnableDisableAP; 674 | EFI_MP_SERVICES_WHOAMI WhoAmI; 675 | }; 676 | 677 | extern EFI_GUID gEfiMpServiceProtocolGuid; 678 | 679 | #endif 680 | --------------------------------------------------------------------------------
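Editor's note (illustrative, not part of the repository): the protocol above is normally consumed by locating EFI_MP_SERVICES_PROTOCOL through boot services, querying the processor counts, and then dispatching work to the APs. The sketch below shows that flow in blocking mode; ap_routine and run_on_all_aps are hypothetical names, and it assumes the project's efi.h provides the usual EFI_BOOT_SERVICES, EFI_ERROR() and status code definitions. The repository's own examples/uefi/main.c may well do this differently.

    #include "efi.h"
    #include "mp_service.h"

    /* Hypothetical per-AP procedure; matches EFI_AP_PROCEDURE. */
    static VOID EFIAPI ap_routine(IN VOID *arg)
    {
        (void)arg; /* per-AP work (e.g. per-vCPU hypervisor init) goes here */
    }

    static EFI_STATUS run_on_all_aps(EFI_BOOT_SERVICES *bs, VOID *arg)
    {
        EFI_GUID mp_guid = EFI_MP_SERVICES_PROTOCOL_GUID;
        EFI_MP_SERVICES_PROTOCOL *mp = NULL;
        UINTN total = 0, enabled = 0;
        EFI_STATUS status;

        /* The protocol is only available during boot services. */
        status = bs->LocateProtocol(&mp_guid, NULL, (VOID **)&mp);
        if (EFI_ERROR(status))
            return status;

        /* BSP-only query of the processor counts. */
        status = mp->GetNumberOfProcessors(mp, &total, &enabled);
        if (EFI_ERROR(status))
            return status;

        /* Blocking dispatch: WaitEvent == NULL, no timeout, run on all
         * enabled APs simultaneously, then run on the BSP ourselves. */
        status = mp->StartupAllAPs(mp, ap_routine, FALSE, NULL, 0, arg, NULL);
        if (status == EFI_NOT_STARTED)
            status = EFI_SUCCESS; /* uniprocessor system: no APs to start */
        if (!EFI_ERROR(status))
            ap_routine(arg);

        return status;
    }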