├── examples
    ├── Makefile
    └── uefi
    │   ├── elf_x86_64_efi.lds
    │   ├── Makefile
    │   ├── main.c
    │   └── mp_service.h
├── scripts
    ├── commands-gdb
    ├── run-qemu.sh
    ├── run-qemu-win10.sh
    ├── gdb-run-qemu-win10.sh
    └── run-qemu-hvci-win10.sh
├── interface
    ├── hypervisor
    │   ├── hypervisor.h
    │   ├── handler_if.h
    │   └── vmcall_if.h
    └── usermode
    │   ├── error.hpp
    │   ├── hypervisor.hpp
    │   └── hypervisor.cpp
├── hypervisor
    ├── platform
    │   ├── serial.h
    │   ├── util.h
    │   ├── spinlock.h
    │   ├── serial.c
    │   ├── arch.h
    │   ├── intrin.h
    │   ├── standard.h
    │   ├── nt.h
    │   └── intrin.asm
    ├── vmm
    │   ├── vmm.h
    │   ├── vmcall.h
    │   ├── shim.h
    │   ├── nested.h
    │   ├── handler.h
    │   ├── ept.h
    │   ├── shim.asm
    │   ├── vmm_reg.h
    │   ├── vmcall.c
    │   ├── vmm_common.h
    │   ├── nested.c
    │   ├── ept.c
    │   ├── handler.c
    │   └── vmm.c
    ├── interrupt
    │   ├── idt.h
    │   ├── idt.asm
    │   └── idt.c
    ├── memory
    │   ├── pmem.h
    │   ├── vmem.h
    │   ├── mem.h
    │   ├── mem.c
    │   ├── pmem.c
    │   └── vmem.c
    ├── impl_hooks.h
    ├── Makefile
    └── hypervisor.c
├── .gitmodules
├── Makefile
├── .gitignore
└── README.md


/examples/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: all
2 | all:
3 | 	$(MAKE) -C uefi


--------------------------------------------------------------------------------
/scripts/commands-gdb:
--------------------------------------------------------------------------------
1 | target remote :1234
2 | layout asm
3 | set disassembly-flavor intel
4 | 


--------------------------------------------------------------------------------
/interface/hypervisor/hypervisor.h:
--------------------------------------------------------------------------------
1 | #ifndef HYPERVISOR_H
2 | #define HYPERVISOR_H
3 | 
4 | void hypervisor_init(void);
5 | 
6 | #endif /* HYPERVISOR_H */


--------------------------------------------------------------------------------
/hypervisor/platform/serial.h:
--------------------------------------------------------------------------------
1 | #ifndef SERIAL_H
2 | #define SERIAL_H
3 | 
4 | void serial_init(void);
5 | void serial_print(char *str);
6 | 
7 | #endif /* SERIAL_H */


--------------------------------------------------------------------------------
/hypervisor/vmm/vmm.h:
--------------------------------------------------------------------------------
1 | #ifndef VMM_H
2 | #define VMM_H
3 | 
4 | #include "vmm_common.h"
5 | 
6 | void vmm_init(struct vmm_init_params *params);
7 | 
8 | #endif /* VMM_H */


--------------------------------------------------------------------------------
/hypervisor/vmm/vmcall.h:
--------------------------------------------------------------------------------
1 | #ifndef VMCALL_H
2 | #define VMCALL_H
3 | 
4 | #include "platform/standard.h"
5 | #include "vmm_common.h"
6 | 
7 | struct vmcall_ctx *vmcall_init(struct vmm_ctx *vmm);
8 | 
9 | #endif /* VMCALL_H */


--------------------------------------------------------------------------------
/hypervisor/interrupt/idt.h:
--------------------------------------------------------------------------------
1 | #ifndef IDT_H
2 | #define IDT_H
3 | 
4 | #include "ia32_compact.h"
5 | 
6 | void idt_init(segment_descriptor_register_64 *orig_idtr, segment_descriptor_register_64 *new_idtr);
7 | 
8 | #endif /* IDT_H */


--------------------------------------------------------------------------------
/scripts/run-qemu.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # To exit do Ctrl-A X
4 | ./submodules/uefi-run/target/debug/uefi-run -b /usr/share/OVMF/OVMF_CODE.fd -q /usr/bin/qemu-system-x86_64 build/uefi.efi -- -nographic -enable-kvm -cpu host -smp 1


--------------------------------------------------------------------------------
/hypervisor/vmm/shim.h:
--------------------------------------------------------------------------------
1 | #ifndef SHIM_H
2 | #define SHIM_H
3 | 
4 | #include <stdint.h>
5 | 
6 | extern __attribute__((ms_abi)) void shim_guest_to_host(void);
7 | extern __attribute__((ms_abi)) void shim_host_to_guest(void);
8 | 
9 | #endif /* SHIM_H */


--------------------------------------------------------------------------------
/hypervisor/vmm/nested.h:
--------------------------------------------------------------------------------
 1 | #ifndef NESTED_H
 2 | #define NESTED_H
 3 | 
 4 | #include "platform/standard.h"
 5 | #include "vmm_common.h"
 6 | 
 7 | void nested_init(struct vmm_ctx *vmm);
 8 | void nested_init_vcpu(struct vcpu_ctx *vcpu);
 9 | 
10 | #endif /* NESTED_H */


--------------------------------------------------------------------------------
/hypervisor/memory/pmem.h:
--------------------------------------------------------------------------------
 1 | #ifndef PMEM_H
 2 | #define PMEM_H
 3 | 
 4 | #include <stddef.h>
 5 | #include <stdint.h>
 6 | 
 7 | void pmem_init(void);
 8 | uintptr_t pmem_alloc_page(void);
 9 | uintptr_t pmem_alloc_contiguous(size_t bytes);
10 | void pmem_free_page(uintptr_t page);
11 | 
12 | #endif /* PMEM_H */


--------------------------------------------------------------------------------
/interface/usermode/error.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef ERROR_H
 2 | #define ERROR_H
 3 | 
 4 | #include <stdio.h>
 5 | 
 6 | #define user_die_on(cond, ...) do { \
 7 |         if (cond) { \
 8 |             printf(__VA_ARGS__); \
 9 |             while (1) {} \
10 |         } \
11 |     } while (0)
12 | 
13 | #endif /* ERROR_H */


--------------------------------------------------------------------------------
/hypervisor/vmm/handler.h:
--------------------------------------------------------------------------------
 1 | #ifndef HANDLER_H
 2 | #define HANDLER_H
 3 | 
 4 | #include "platform/standard.h"
 5 | #include "vmm_common.h"
 6 | #include "handler_if.h"
 7 | 
 8 | struct handler_ctx *handler_init(struct vmm_ctx *vmm);
 9 | __attribute__((ms_abi)) void handler_guest_to_host(struct vcpu_context *guest_ctx);
10 | 
11 | #endif /* HANDLER_H */


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
 1 | [submodule "ia32-doc"]
 2 | 	path = submodules/ia32-doc
 3 | 	url = https://github.com/wbenny/ia32-doc.git
 4 | [submodule "uefi-run"]
 5 | 	path = submodules/uefi-run
 6 | 	url = git@github.com:POPFD/uefi-run.git
 7 | [submodule "hypervisor/platform/printf"]
 8 | 	path = hypervisor/platform/printf
 9 | 	url = https://github.com/mpaland/printf.git
10 | 


--------------------------------------------------------------------------------
/hypervisor/vmm/ept.h:
--------------------------------------------------------------------------------
 1 | #ifndef EPT_H
 2 | #define EPT_H
 3 | 
 4 | #include "ia32_compact.h"
 5 | 
 6 | struct ept_ctx *ept_init(void);
 7 | eptp *ept_get_pointer(struct ept_ctx *ctx);
 8 | ept_pde_2mb *get_ept_pml2e(struct ept_ctx *ctx, uintptr_t phys_addr);
 9 | ept_pte *ept_get_pml1e(struct ept_ctx *ctx, uintptr_t phys_addr);
10 | void ept_invalidate_and_flush(struct ept_ctx *ctx);
11 | 
12 | #endif /* EPT_H */


--------------------------------------------------------------------------------
/hypervisor/memory/vmem.h:
--------------------------------------------------------------------------------
 1 | #ifndef VMEM_H
 2 | #define VMEM_H
 3 | 
 4 | #include "platform/standard.h"
 5 | #include "ia32_compact.h"
 6 | 
 7 | #define MEM_READ_ONLY   (0)
 8 | #define MEM_WRITE       (1 << 0)
 9 | #define MEM_EXECUTE     (1 << 1)
10 | 
11 | void vmem_init(cr3 *original_cr3, cr3 *new_cr3);
12 | void *vmem_alloc(size_t size, unsigned int flags);
13 | void vmem_change_perms(void *addr, size_t size, unsigned int flags);
14 | 
15 | #endif /* VMEM_H */


--------------------------------------------------------------------------------
/hypervisor/platform/util.h:
--------------------------------------------------------------------------------
 1 | #ifndef UTIL_H
 2 | #define UTIL_H
 3 | 
 4 | #include "standard.h"
 5 | 
 6 | static inline void bitmap_clear_bit(uint8_t *bitmap, size_t bit)
 7 | {
 8 |     size_t idx = bit / 8;
 9 |     size_t pos = bit % 8;
10 | 
11 |     bitmap[idx] &= ~(1 << pos);
12 | }
13 | 
14 | static inline void bitmap_set_bit(uint8_t *bitmap, size_t bit)
15 | {
16 |     size_t idx = bit / 8;
17 |     size_t pos = bit % 8;
18 | 
19 |     bitmap[idx] |= 1 << pos;
20 | }
21 | 
22 | #endif /* UTIL_H */


--------------------------------------------------------------------------------
/hypervisor/memory/mem.h:
--------------------------------------------------------------------------------
 1 | #ifndef MEM_H
 2 | #define MEM_H
 3 | 
 4 | #include "platform/standard.h"
 5 | #include "ia32_compact.h"
 6 | 
 7 | enum copy_dir {
 8 |     COPY_READ,
 9 |     COPY_WRITE
10 | };
11 | 
12 | uintptr_t mem_va_to_pa(cr3 table, void *va);
13 | bool mem_copy_virt_tofrom_host(enum copy_dir dir, cr3 table,
14 |                              uintptr_t addr, void *buffer, size_t size);
15 | bool mem_copy_virt_to_virt(cr3 src_cr3, void *src, cr3 dest_cr3, void *dest, size_t size);
16 | 
17 | 
18 | #endif /* MEM_H */


--------------------------------------------------------------------------------
/interface/hypervisor/handler_if.h:
--------------------------------------------------------------------------------
 1 | #ifndef HANDLER_COMMON_H
 2 | #define HANDLER_COMMON_H
 3 | 
 4 | #include <stddef.h>
 5 | #include <stdint.h>
 6 | #include <stdbool.h>
 7 | 
 8 | typedef void (*vmexit_cbk_t)(struct vcpu_ctx *vcpu, void *opaque, bool *move_to_next);
 9 | 
10 | void handler_register_exit(struct handler_ctx *ctx,
11 |                            size_t exit_reason,
12 |                            vmexit_cbk_t callback,
13 |                            void *opaque,
14 |                            bool override);
15 | 
16 | #endif /* HANDLER_COMMON_H */


--------------------------------------------------------------------------------
/hypervisor/vmm/shim.asm:
--------------------------------------------------------------------------------
 1 | section .text
 2 | 
 3 | global shim_guest_to_host
 4 | 
 5 | extern __capture_context
 6 | extern handler_guest_to_host
 7 | extern vmm_hyperjack_handler
 8 | 
 9 | shim_guest_to_host:
10 |     ; Save the RCX register and then load into RCX the value
11 |     ; where we will want to store our stack offsetting by the
12 |     ; push we just did to preserve RCX. This is then passed
13 |     ; as a parameter to capture_context so that the guest
14 |     ; context is stored within the host stack.
15 |     push rcx
16 |     lea rcx, [rsp + 08h]
17 |     call __capture_context
18 |     jmp handler_guest_to_host


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | 
 2 | # Compiler and linker specified for hypervisor
 3 | 
 4 | # Directories
 5 | ROOTDIR := $(shell pwd)
 6 | BUILDDIR := $(ROOTDIR)/build
 7 | OBJDIR := $(BUILDDIR)/obj
 8 | 
 9 | # Child makefile flags
10 | export ROOTDIR
11 | export OBJDIR
12 | export BUILDDIR
13 | 
14 | # Configuration for making all files.
15 | .PHONY: all
16 | all: prep_dirs
17 | 	$(MAKE) -C hypervisor
18 | 	$(MAKE) -C examples
19 | 
20 | # Configuration that creates directories needed
21 | prep_dirs:
22 | 	mkdir -p $(BUILDDIR)
23 | 	mkdir -p $(OBJDIR)
24 | 
25 | # Cleaning of unneeded files.
26 | .PHONY: clean
27 | clean:
28 | 	rm -rf $(OBJDIR)
29 | 	rm -rf $(BUILDDIR)


--------------------------------------------------------------------------------
/scripts/run-qemu-win10.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | TARGET_IMG=/vm/win10-uefi-dev.qcow2
 4 | TARGET_IMG_SNAP=$TARGET_IMG.snap
 5 | 
 6 | # Create a snapshot of the win10 image we want to use.
 7 | rm -f $TARGET_IMG_SNAP
 8 | qemu-img create -f qcow2 -F qcow2 -b $TARGET_IMG $TARGET_IMG_SNAP
 9 | 
10 | # To exit do Ctrl-A X
11 | ./submodules/uefi-run/target/debug/uefi-run -b /usr/share/OVMF/OVMF_CODE.fd -q /usr/local/bin/qemu-system-x86_64 -d \
12 |     build/uefi.efi -f build/hypervisor.a -- \
13 |     -display gtk -enable-kvm -serial stdio -cpu host -smp 4 -m 8G \
14 |     -drive file=$TARGET_IMG_SNAP,media=disk,if=ide,cache=off,index=1 \
15 |     -drive file=fat:rw:./build/


--------------------------------------------------------------------------------
/scripts/gdb-run-qemu-win10.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | TARGET_IMG=/vm/win10-uefi-dev.qcow2
 4 | TARGET_IMG_SNAP=$TARGET_IMG.snap
 5 | 
 6 | # Create a snapshot of the win10 image we want to use.
 7 | rm -f $TARGET_IMG_SNAP
 8 | qemu-img create -f qcow2 -F qcow2 -b $TARGET_IMG $TARGET_IMG_SNAP
 9 | 
10 | # To exit do Ctrl-A X
11 | ./submodules/uefi-run/target/debug/uefi-run -b /usr/share/OVMF/OVMF_CODE.fd -q /usr/local/bin/qemu-system-x86_64 -d \
12 |     build/uefi.efi -f build/hypervisor.a -- \
13 |     -s -S \
14 |     -d int,cpu_reset \
15 |     -display gtk -enable-kvm -serial stdio -cpu host -smp 4 -m 8G \
16 |     -drive file=$TARGET_IMG_SNAP,media=disk,if=ide,cache=off,index=1 \
17 |     -drive file=fat:rw:./build/


--------------------------------------------------------------------------------
/hypervisor/platform/spinlock.h:
--------------------------------------------------------------------------------
 1 | #ifndef SPINLOCK_H
 2 | #define SPINLOCK_H
 3 | 
 4 | #include <stdint.h>
 5 | 
 6 | typedef int spinlock_t;
 7 | 
 8 | static inline void spin_init(spinlock_t *lock)
 9 | {
10 |     *lock = 0;
11 | }
12 | 
13 | static inline void spin_lock(spinlock_t *lock)
14 | {
15 |     while (1) {
16 |         int zero = 0;
17 |         int one = 1;
18 |         if (__atomic_compare_exchange(lock, &zero, &one, 0,
19 |                                       __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST))
20 |             return;
21 |     }
22 | }
23 | 
24 | static inline void spin_unlock(spinlock_t *lock)
25 | {
26 |     int zero = 0;
27 |     __atomic_store(lock, &zero, __ATOMIC_SEQ_CST);
28 | }
29 | 
30 | #endif /* SPINLOCK_H */


--------------------------------------------------------------------------------
/interface/usermode/hypervisor.hpp:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include "vmcall_if.h"
 3 | 
 4 | /*
 5 |  * Defines a standard C++ class for interfacing with
 6 |  * the hypervisor.
 7 |  */
 8 | class hypervisor {
 9 | 
10 |     /* Windows VEH registration and checking.
11 |      * we don't have access to __try & __except
12 |      * intrinsincs as is MSVC so we must use VEH. */
13 |     void register_exception_handler();
14 |     bool check_and_clear_exception();
15 | 
16 |     /* Performs the sending of a VMCALL to the hypervisor. */
17 |     bool send_call(vmcall_param &param);
18 | public:
19 |     hypervisor();
20 | 
21 |     /* Hypervisor specific actions. */
22 |     bool check_presence();
23 |     bool load_plugin(std::string file_name);
24 | };


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Prerequisites
 2 | *.d
 3 | 
 4 | # Object files
 5 | *.o
 6 | *.ko
 7 | *.obj
 8 | *.elf
 9 | 
10 | # Linker output
11 | *.ilk
12 | *.map
13 | *.exp
14 | 
15 | # Precompiled Headers
16 | *.gch
17 | *.pch
18 | 
19 | # Libraries
20 | *.lib
21 | *.a
22 | *.la
23 | *.lo
24 | 
25 | # Shared objects (inc. Windows DLLs)
26 | *.dll
27 | *.so
28 | *.so.*
29 | *.dylib
30 | 
31 | # Executables
32 | *.exe
33 | *.out
34 | *.app
35 | *.i*86
36 | *.x86_64
37 | *.hex
38 | 
39 | # Debug files
40 | *.dSYM/
41 | *.su
42 | *.idb
43 | *.pdb
44 | 
45 | # Kernel Module Compile Results
46 | *.mod*
47 | *.cmd
48 | .tmp_versions/
49 | modules.order
50 | Module.symvers
51 | Mkfile.old
52 | dkms.conf
53 | 
54 | # Build directory
55 | build/*
56 | 
57 | # IDE
58 | .vscode/*
59 | 


--------------------------------------------------------------------------------
/scripts/run-qemu-hvci-win10.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | TARGET_UEFI_IMAGE=$1
 4 | TARGET_IMG=/vm/hvci-win10-uefi-dev.qcow2
 5 | TARGET_IMG_SNAP=$TARGET_IMG.snap
 6 | 
 7 | SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
 8 | 
 9 | # Create a snapshot of the win10 image we want to use.
10 | rm -f $TARGET_IMG_SNAP
11 | qemu-img create -f qcow2 -F qcow2 -b $TARGET_IMG $TARGET_IMG_SNAP
12 | 
13 | # To exit do Ctrl-A X
14 | $SCRIPT_DIR/../submodules/uefi-run/target/debug/uefi-run -b /usr/share/OVMF/OVMF_CODE.fd -q /usr/local/bin/qemu-system-x86_64 $TARGET_UEFI_IMAGE -- \
15 |     -display gtk -enable-kvm -serial stdio -cpu host -smp 1 -m 8G \
16 |     -drive file=$TARGET_IMG_SNAP,media=disk,if=ide,cache=off,index=1 \
17 |     -drive file=fat:rw:./build/


--------------------------------------------------------------------------------
/hypervisor/impl_hooks.h:
--------------------------------------------------------------------------------
 1 | #ifndef IMPL_HOOKS_H
 2 | #define IMPL_HOOKS_H
 3 | 
 4 | /*
 5 |  * This header defines all hooks (aka missing code) that
 6 |  * the user of this library needs to implement.
 7 |  *
 8 |  * This is to allow the hypervisor to successfully compile.
 9 |  *
10 |  * At time of linking of the library with the consuming application
11 |  * if none of these hooks are implemented you will get an error,
12 |  * aka implement them.
13 |  */
14 | #include "platform/standard.h"
15 | 
16 | /* Used so the hypervisor can run a specific callback on each physical processor. */
17 | bool impl_run_all_processors(__attribute__((ms_abi)) void (*callback)(void *opaque), void *opaque);
18 | 
19 | /* Used for retrieving the current processor index. */
20 | bool impl_get_processor_index(size_t *index);
21 | 
22 | #endif /* IMPL_HOOKS_H */


--------------------------------------------------------------------------------
/hypervisor/platform/serial.c:
--------------------------------------------------------------------------------
 1 | #include <stdint.h>
 2 | #include <stdbool.h>
 3 | #include "serial.h"
 4 | #include "arch.h"
 5 | 
 6 | #define SERIAL_PORT 0x3F8 /* COM1 */
 7 | 
 8 | static bool is_tx_empty(void)
 9 | {
10 |     return (inb(SERIAL_PORT + 5) & 0x20) != 0;
11 | }
12 | 
13 | static void print_char(char c)
14 | {
15 |     while (!is_tx_empty()) {};
16 |     outb(SERIAL_PORT + 0, c);
17 | }
18 | 
19 | void serial_init(void)
20 | {
21 | 	outb(SERIAL_PORT + 1, 0x00);	/* Disable seiral interrupts. */
22 | 	outb(SERIAL_PORT + 3, 0x80);	/* Enable DLAB (set baud rate divisor). */
23 | 	outb(SERIAL_PORT + 0, 0x01);	/* Set divisor to 1 (lo byte) 115200/1 baud. */
24 | 	outb(SERIAL_PORT + 1, 0x00);	/*					(hi byte) */
25 | 	outb(SERIAL_PORT + 3, 0x03);	/* 8 bits, no parity, one stop bit. */
26 | 	outb(SERIAL_PORT + 2, 0xC7);	/* Enable FIFO, clear them with 14-byte threshold. */
27 | 	outb(SERIAL_PORT + 4, 0x0B);	/* IRQs enabled, RTS/DSR set. */
28 | 	outb(SERIAL_PORT + 4, 0x0F);
29 | 
30 | 	/* Clear the screen & set home position. */
31 | 	serial_print("\033[2J");
32 | 	serial_print("\033[H");
33 | }
34 | 
35 | void serial_print(char *str)
36 | {
37 |     for (int i = 0; str[i]; i++) {
38 |         print_char(str[i]);
39 |     }
40 | }


--------------------------------------------------------------------------------
/hypervisor/Makefile:
--------------------------------------------------------------------------------
 1 | FILE_NAME := $(shell basename `pwd`)
 2 | 
 3 | OUT_OBJ_DIR := $(OBJDIR)/$(FILE_NAME)
 4 | OUT_LIB_NAME := $(BUILDDIR)/$(FILE_NAME).a
 5 | 
 6 | CC := gcc
 7 | AR := ar
 8 | 
 9 | # CDEFINES := CONFIG_NESTED
10 | 
11 | CFLAGS := -fno-stack-protector		\
12 | 	-fcf-protection=none			\
13 | 	-mno-shstk						\
14 | 	-fdiagnostics-color				\
15 | 	-fshort-wchar					\
16 | 	-mno-sse						\
17 | 	-mno-red-zone					\
18 | 	-Wall							\
19 | 	-Wextra							\
20 | 	-Werror							\
21 | 	-I../submodules/ia32-doc/out/	\
22 | 	-I../interface/hypervisor/		\
23 | 	-I.								\
24 | 	-DPRINTF_DISABLE_SUPPORT_FLOAT	\
25 | 	$(addprefix -D, $(CDEFINES))
26 | 
27 | ASMFLAGS := -f elf64 \
28 | 	-Werror
29 | 
30 | # Use wildcards to gather all of the c files we need to find.
31 | C_FILES := $(shell find . -name '*.c')
32 | C_OBJ_FILES := $(patsubst %.c,%.o, $(C_FILES))
33 | 
34 | ASM_FILES := $(shell find . -name '*.asm')
35 | ASM_OBJ_FILES := $(patsubst %.asm,%.oasm, $(ASM_FILES))
36 | 
37 | .PHONY: all
38 | all: $(OUT_LIB_NAME)
39 | 
40 | %.o: %.c
41 | 	mkdir -p  $(OUT_OBJ_DIR)/$(dir $<)
42 | 	$(CC) -c -o $(OUT_OBJ_DIR)/$@ $< $(CFLAGS)
43 | 
44 | %.oasm: %.asm
45 | 	mkdir -p  $(OUT_OBJ_DIR)/$(dir $<)
46 | 	nasm $< -o $(OUT_OBJ_DIR)/$@ $(ASMFLAGS)
47 | 
48 | $(OUT_LIB_NAME): $(C_OBJ_FILES) $(ASM_OBJ_FILES)
49 | 	$(AR) rcs $@ $(addprefix $(OUT_OBJ_DIR)/, $^)
50 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Cascade
 2 | ## A thin introspection hypervisor framework
 3 | 
 4 | This framework runs a thin hypervisor utilising Intel VT-x to allow granular control of a system.
 5 | 
 6 | As this is a introspection style hypervisor all system resources & devices are passed through to the
 7 | "virtualised" system once initialised. Therefore any operating system of choice can then be loaded
 8 | as if normal.
 9 | 
10 | Cascade gives the ability to register custom exit handlers for VMEXIT's generated by the guest as
11 | well as providing a custom VMCALL interface for registering and executing your own custom callbacks
12 | at VMROOT.
13 | 
14 | An example project is given as a UEFI runtime driver, allowing early boot introspection/blue-pilling
15 | of a system. This UEFI runtime driver does not do much apart from load the hypervisor, however it
16 | is an example as to how this library can be utilised.
17 | 
18 | ### Installation, Compilation & Testing (Ubuntu)
19 | 1. Ensure the following dependencies are installed on your system
20 | 
21 |     ```sudo apt-get install qemu qemu-utils ovmf gnu-efi binutils-mingw-w64 gcc-mingw-w64 xorriso mtools cargo```
22 |     ```cargo install uefi-run```
23 | 
24 | 2. To compile run a simple make command
25 |    
26 |    ```make -j $(nproc)```
27 | 
28 | 3. To run the build quickly in a QEMU instance use the EFI run tool (or use ```./run-qemu.sh```)
29 | 
30 |    ```uefi-run -b /usr/share/OVMF/OVMF_CODE.fd -q /usr/bin/qemu-system-x86_64 build/hypervisor.efi```


--------------------------------------------------------------------------------
/examples/uefi/elf_x86_64_efi.lds:
--------------------------------------------------------------------------------
 1 | /* Same as elf_x86_64_fbsd_efi.lds, except for OUTPUT_FORMAT below - KEEP IN SYNC */
 2 | OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
 3 | OUTPUT_ARCH(i386:x86-64)
 4 | ENTRY(_start)
 5 | SECTIONS
 6 | {
 7 |   . = 0;
 8 |   ImageBase = .;
 9 |   /* .hash and/or .gnu.hash MUST come first! */
10 |   .hash : { *(.hash) }
11 |   .gnu.hash : { *(.gnu.hash) }
12 |   . = ALIGN(4096);
13 |   .eh_frame : 
14 |   { 
15 |     *(.eh_frame)
16 |   }
17 |   . = ALIGN(4096);
18 |   .text :
19 |   {
20 |    _text = .;
21 |    *(.text)
22 |    *(.text.*)
23 |    *(.gnu.linkonce.t.*)
24 |    . = ALIGN(16);
25 |   }
26 |   _etext = .;
27 |   _text_size = . - _text;
28 |   . = ALIGN(4096);
29 |   .reloc :
30 |   {
31 |    *(.reloc)
32 |   }
33 |   . = ALIGN(4096);
34 |   .data :
35 |   {
36 |    _data = .;
37 |    *(.rodata*)
38 |    *(.got.plt)
39 |    *(.got)
40 |    *(.data*)
41 |    *(.sdata)
42 |   }
43 |   .note.gnu.build-id : { *(.note.gnu.build-id) }
44 | 
45 |   _edata = .;
46 |   _data_size = . - _etext;
47 |   . = ALIGN(4096);
48 |   .bss :
49 |   {
50 |    *(.sbss)
51 |    *(.scommon)
52 |    *(.dynbss)
53 |    *(.bss)
54 |    *(COMMON)
55 |    *(.rel.local)
56 |   }
57 |   . = ALIGN(4096);
58 |   .dynamic  : { *(.dynamic) }
59 |   . = ALIGN(4096);
60 |   .rela :
61 |   {
62 |     *(.rela.data*)
63 |     *(.rela.got)
64 |     *(.rela.stab)
65 |   }
66 |   . = ALIGN(4096);
67 |   .dynsym   : { *(.dynsym) }
68 |   . = ALIGN(4096);
69 |   .dynstr   : { *(.dynstr) }
70 |   . = ALIGN(4096);
71 |   .ignored.reloc :
72 |   {
73 |     *(.rela.reloc)
74 |     *(.eh_frame)
75 |     *(.note.GNU-stack)
76 |   }
77 |   .comment 0 : { *(.comment) }
78 | }
79 | 


--------------------------------------------------------------------------------
/examples/uefi/Makefile:
--------------------------------------------------------------------------------
 1 | FILE_NAME := $(shell basename `pwd`)
 2 | 
 3 | OUT_OBJ_DIR := $(OBJDIR)/$(FILE_NAME)
 4 | OUT_SO_NAME := $(OBJDIR)/$(FILE_NAME)/$(FILE_NAME).so
 5 | OUT_EFI_NAME := $(BUILDDIR)/$(FILE_NAME).efi
 6 | 
 7 | EFI_LIB_PATH := /usr/lib/
 8 | 
 9 | CC := gcc
10 | LD := ld
11 | OC := objcopy
12 | 
13 | CFLAGS := -fno-stack-protector			\
14 | 	-fcf-protection=none				\
15 | 	-mno-shstk							\
16 | 	-fshort-wchar						\
17 | 	-mno-sse							\
18 | 	-mno-red-zone						\
19 | 	-Wall								\
20 | 	-Wextra								\
21 | 	-Werror								\
22 | 	-I/usr/include/efi					\
23 | 	-I/usr/include/efi/x86_64			\
24 | 	-I../../submodules/ia32-doc/out/	\
25 | 	-I../../interface/hypervisor/		\
26 | 	-I.									\
27 | 	-DEFI_FUNCTION_WRAPPER				\
28 | 
29 | LDFLAGS := $(EFI_LIB_PATH)crt0-efi-x86_64.o	\
30 | 	-shared									\
31 | 	-nostdlib								\
32 | 	-znocombreloc							\
33 | 	-T elf_x86_64_efi.lds					\
34 | 	-Bsymbolic								\
35 | 	-L $(EFI_LIB_PATH)						\
36 | 	-l:libgnuefi.a							\
37 | 	-l:libefi.a								\
38 | 	-L $(BUILDDIR)							\
39 | 	-l:hypervisor.a							\
40 | 
41 | OCFLAGS := -j .text			\
42 | 	-j .sdata				\
43 | 	-j .data				\
44 | 	-j .bss					\
45 | 	-j .dynamic				\
46 | 	-j .dynsym				\
47 | 	-j .rel					\
48 | 	-j .rela				\
49 | 	-j .reloc				\
50 | 	--subsystem efi-rtd		\
51 | 	--target=efi-app-x86_64 \
52 | 
53 | # Use wildcards to gather all of the c files we need to find.
54 | C_FILES := $(shell find . -name '*.c')
55 | C_OBJ_FILES := $(patsubst %.c,%.o, $(C_FILES))
56 | 
57 | .PHONY: all
58 | all: $(OUT_EFI_NAME)
59 | 
60 | %.o: %.c
61 | 	mkdir -p  $(OUT_OBJ_DIR)/$(dir $<)
62 | 	$(CC) -c -o $(OUT_OBJ_DIR)/$@ $< $(CFLAGS)
63 | 
64 | $(OUT_SO_NAME): $(C_OBJ_FILES)
65 | 	$(LD) -o $@ $(addprefix $(OUT_OBJ_DIR)/, $^) $(LDFLAGS)
66 | 
67 | $(OUT_EFI_NAME): $(OUT_SO_NAME)
68 | 	rm -f $(OUT_EFI_NAME)
69 | 	$(OC) $(OCFLAGS) $^ $@


--------------------------------------------------------------------------------
/hypervisor/hypervisor.c:
--------------------------------------------------------------------------------
 1 | #include "platform/standard.h"
 2 | #include "platform/serial.h"
 3 | #include "memory/pmem.h"
 4 | #include "memory/vmem.h"
 5 | #include "interrupt/idt.h"
 6 | #include "vmm/vmm.h"
 7 | 
 8 | static void trigger_cpuid(void)
 9 | {
10 |     /* Running a CPUID should trigger an exit. */
11 |     uint64_t ticks_before = __rdtsc();
12 | 
13 |     uint64_t rax, rbx, rcx, rdx;
14 |     asm volatile (
15 |         "movl $0x40000000, %%eax;"
16 |         "cpuid;"
17 |         : "=a"(rax), "=b"(rbx), "=c"(rcx), "=d"(rdx)
18 |     );
19 |     uint64_t ticks_delta = __rdtsc() - ticks_before;
20 | 
21 |     debug_print("Test CPUID leaf=0x40000000 eax=0x%lX ebx=0x%lX ecx=0x%lX edx=0x%lX ticks=%ld",
22 |                 rax, rbx, rcx, rdx, ticks_delta);
23 | }
24 | 
25 | static void test_rdmsr(void)
26 | {
27 |     /* Reading MSRs may trigger a VM exit, if set in the bitmap. */
28 |     uint64_t ticks_before = __rdtsc();
29 |     uint64_t dummy_msr = rdmsr(IA32_TIME_STAMP_COUNTER);
30 |     uint64_t ticks_delta = __rdtsc() - ticks_before;
31 | 
32 |     debug_print("Test RDMSR dummy_val=0x%lX ticks=%ld", dummy_msr, ticks_delta);
33 | }
34 | 
35 | static void hypervisor_tests(void)
36 | {
37 |     trigger_cpuid();
38 |     test_rdmsr();
39 | }
40 | 
41 | void hypervisor_init(void)
42 | {
43 | 
44 | //#define DEBUG_IDA
45 | #ifdef DEBUG_IDA
46 |     static volatile int wait_debug = 0;
47 | 
48 |     while (!wait_debug) {}
49 | #endif
50 | 
51 |     /* Initialise all of the required modules and set up the parameters
52 |      * required for the VMM to start. */
53 |     struct vmm_init_params vmm_params = { 0 };
54 | 
55 |     serial_init();
56 |     pmem_init();
57 |     vmem_init(&vmm_params.guest_cr3, &vmm_params.host_cr3);
58 |     idt_init(&vmm_params.guest_idtr, &vmm_params.host_idtr);
59 |     vmm_init(&vmm_params);
60 | 
61 |     hypervisor_tests();
62 | 
63 |     debug_print("Hypervisor initialised.");
64 | }


--------------------------------------------------------------------------------
/hypervisor/platform/arch.h:
--------------------------------------------------------------------------------
 1 | #ifndef ARCH_H
 2 | #define ARCH_H
 3 | 
 4 | #include <stdint.h>
 5 | #include <cpuid.h>
 6 | 
 7 | /* Segment indexes. */
 8 | enum seg_idx{
 9 | 	SEG_ES = 0,
10 | 	SEG_CS,
11 | 	SEG_SS,
12 | 	SEG_DS,
13 | 	SEG_FS,
14 | 	SEG_GS
15 | };
16 | 
17 | /* CPUID ease of use. */
18 | struct cpuid_leaf_output {
19 |     uint32_t eax;
20 |     uint32_t ebx;
21 |     uint32_t ecx;
22 |     uint32_t edx;
23 | };
24 | 
25 | /* This has been created so that the ia_32_compact.h defines for CPUID leafs can be used
26 |  * without having to manually reconstruct each of the variables one by one. */
27 | #define CPUID_LEAF_READ(leaf, output) __get_cpuid(leaf, &output.eax.flags, &output.ebx.flags, \
28 |                                                    &output.ecx.flags, &output.edx.flags)
29 | 
30 | /* MSR and IO port handling helpers. */
31 | static inline uint64_t rdmsr(uint64_t msr)
32 | {
33 | 	uint32_t low, high;
34 | 	asm volatile (
35 | 		"rdmsr"
36 | 		: "=a"(low), "=d"(high)
37 | 		: "c"(msr)
38 | 	);
39 | 	return ((uint64_t)high << 32) | low;
40 | }
41 | 
42 | static inline void wrmsr(uint64_t msr, uint64_t value)
43 | {
44 | 	uint32_t low = value & 0xFFFFFFFF;
45 | 	uint32_t high = value >> 32;
46 | 	asm volatile (
47 | 		"wrmsr"
48 | 		:
49 | 		: "c"(msr), "a"(low), "d"(high)
50 | 	);
51 | }
52 | 
53 | static inline void outb(uint16_t port, uint8_t val)
54 | {
55 |     asm volatile ( "outb %0, %1" : : "a"(val), "Nd"(port) );
56 | }
57 | 
58 | static inline uint8_t inb(uint16_t port)
59 | {
60 |     uint8_t ret;
61 |     asm volatile ( "inb %1, %0"
62 |                    : "=a"(ret)
63 |                    : "Nd"(port) );
64 |     return ret;
65 | }
66 | 
67 | /* Architecture specific register defines. */
68 | #define CR4_VMXE_SHIFT 13ull
69 | #define CR4_VMXE_MASK (1ull << CR4_VMXE_SHIFT)
70 | #define CR4_LA57_SHIFT 12ull
71 | #define CR4_LA57_MASK (1ull << CR4_LA57_SHIFT)
72 | 
73 | #endif /* ARCH_H */


--------------------------------------------------------------------------------
/examples/uefi/main.c:
--------------------------------------------------------------------------------
 1 | #include <stdint.h>
 2 | #include <stddef.h>
 3 | #include <stdbool.h>
 4 | #include <efi.h>
 5 | #include <efilib.h>
 6 | #include "hypervisor.h"
 7 | #include "mp_service.h"
 8 | 
 9 | static EFI_MP_SERVICES_PROTOCOL *mp_protocol = NULL;
10 | 
11 | /*
12 |  * Definitions of the callback hooks that are required to be implemented
13 |  * by the system. In UEFI we implement these via EFI_MP_SERVICES.
14 |  */
15 | bool impl_run_all_processors(__attribute__((ms_abi)) void (*callback)(void *opaque), void *opaque)
16 | {
17 |     EFI_STATUS status;
18 |     UINTN proc_count;
19 |     UINTN enabled_procs;
20 | 
21 |     status = uefi_call_wrapper(mp_protocol->GetNumberOfProcessors, 3,
22 |                                mp_protocol, &proc_count, &enabled_procs);
23 |     if (status)
24 |         return false;
25 | 
26 |     /* Call on this processor first. */
27 |     callback(opaque);
28 | 
29 |     /* Call on other processors now. */
30 |     if (enabled_procs > 1) {
31 |         status = uefi_call_wrapper(mp_protocol->StartupAllAPs, 7,
32 |                                    mp_protocol, (EFI_AP_PROCEDURE)callback,
33 |                                    true, NULL, 0, opaque, NULL);
34 |         if (status)
35 |             return false;
36 |     }
37 | 
38 |     return true;
39 | }
40 | 
41 | bool impl_get_processor_index(size_t *index)
42 | {
43 |     EFI_STATUS status = uefi_call_wrapper(mp_protocol->WhoAmI, 2, mp_protocol, (UINTN *)index);
44 |     return (status == EFI_SUCCESS);
45 | }
46 | 
47 | EFI_STATUS EFIAPI efi_main (EFI_HANDLE image_handle, EFI_SYSTEM_TABLE *system_table)
48 | {
49 |     static const EFI_GUID MP_GUID = EFI_MP_SERVICES_PROTOCOL_GUID;
50 | 
51 |     InitializeLib(image_handle, system_table);
52 | 
53 |     /* Locate the MP protocol so we can fill in our hooks for the hypervisor. */
54 |     EFI_STATUS status = uefi_call_wrapper(system_table->BootServices->LocateProtocol, 3,
55 |         &MP_GUID, NULL, &mp_protocol);
56 | 
57 |     if (status)
58 |         return status;
59 | 
60 | 
61 |     hypervisor_init();
62 | 
63 |     return EFI_SUCCESS;
64 | }


--------------------------------------------------------------------------------
/hypervisor/platform/intrin.h:
--------------------------------------------------------------------------------
 1 | #ifndef INTRIN_H
 2 | #define INTRIN_H
 3 | 
 4 | #include <stdint.h>
 5 | #include "platform/standard.h"
 6 | 
 7 | extern __attribute__((ms_abi)) uint16_t __readcs();
 8 | extern __attribute__((ms_abi)) uint64_t __readcr0();
 9 | extern __attribute__((ms_abi)) uint64_t __readcr2();
10 | extern __attribute__((ms_abi)) uint64_t __readcr3();
11 | extern __attribute__((ms_abi)) uint64_t __readcr4();
12 | extern __attribute__((ms_abi)) uint64_t __readdr7();
13 | extern __attribute__((ms_abi)) uint64_t __rdtsc(void);
14 | extern __attribute__((ms_abi)) void __writecr0(uint64_t cr0);
15 | extern __attribute__((ms_abi)) void __writecr3(uint64_t cr3);
16 | extern __attribute__((ms_abi)) void __writecr4(uint64_t cr4);
17 | extern __attribute__((ms_abi)) void __lidt(void *idt);
18 | extern __attribute__((ms_abi)) void __sidt(void *idt);
19 | extern __attribute__((ms_abi)) void __lgdt(void *gdt);
20 | extern __attribute__((ms_abi)) void __sgdt(void *gdt);
21 | extern __attribute__((ms_abi)) void __lldt(void *ldt);
22 | extern __attribute__((ms_abi)) void __sldt(void *ldt);
23 | extern __attribute__((ms_abi)) void __str(void *tr);
24 | extern __attribute__((ms_abi)) void __ltr(void *tr);
25 | extern __attribute__((ms_abi)) void __xsetbv(uint64_t field, uint64_t val);
26 | extern __attribute__((ms_abi)) void __invd(void);
27 | extern __attribute__((ms_abi)) void __invlpg(uintptr_t *addr);
28 | extern __attribute__((ms_abi)) void __invept(uint64_t ext, void *addr);
29 | extern __attribute__((ms_abi)) int __vmxon(void *vmxon);
30 | extern __attribute__((ms_abi)) int __vmclear(void *vmcs);
31 | extern __attribute__((ms_abi)) int __vmptrld(void *vmcs);
32 | extern __attribute__((ms_abi)) void __vmwrite(size_t field, size_t value);
33 | extern __attribute__((ms_abi)) size_t __vmread(size_t field);
34 | extern __attribute__((ms_abi)) int __vmlaunch(void);
35 | extern __attribute__((ms_abi, noreturn)) void __vmresume(void);
36 | extern __attribute__((ms_abi)) void __capture_context(void *context);
37 | extern __attribute__((ms_abi)) void __restore_context(void *context);
38 | 
39 | #endif /* INTRIN_H */


--------------------------------------------------------------------------------
/hypervisor/interrupt/idt.asm:
--------------------------------------------------------------------------------
  1 | ; Based upon Satoshi Tandasat's IDT handling stub/asm file.
  2 | 
  3 | section .text
  4 | 
  5 | extern idt_exception_handler
  6 | 
  7 | %macro pushaq 0
  8 |     push rax
  9 |     push rcx
 10 |     push rdx
 11 |     push rbx
 12 |     push rbp
 13 |     push rsi
 14 |     push rdi
 15 |     push r8
 16 |     push r9
 17 |     push r10
 18 |     push r11
 19 |     push r12
 20 |     push r13
 21 |     push r14
 22 |     push r15
 23 | %endmacro
 24 | 
 25 | %macro popaq 0
 26 |     pop r15
 27 |     pop r14
 28 |     pop r13
 29 |     pop r12
 30 |     pop r11
 31 |     pop r10
 32 |     pop r9
 33 |     pop r8
 34 |     pop rdi
 35 |     pop rsi
 36 |     pop rbp
 37 |     pop rbx
 38 |     pop rdx
 39 |     pop rcx
 40 |     pop rax
 41 | %endmacro
 42 | 
 43 | %macro isr_err_stub 1
 44 | isr_stub_%+%1:
 45 |     cli
 46 |     push %1
 47 |     jmp common_exception_handler
 48 | %endmacro
 49 | 
 50 | %macro isr_no_err_stub 1
 51 | isr_stub_%+%1:
 52 |     cli
 53 |     push 0
 54 |     push %1
 55 |     jmp common_exception_handler
 56 | %endmacro
 57 | 
 58 | isr_no_err_stub 0
 59 | isr_no_err_stub 1
 60 | isr_no_err_stub 2
 61 | isr_no_err_stub 3
 62 | isr_no_err_stub 4
 63 | isr_no_err_stub 5
 64 | isr_no_err_stub 6
 65 | isr_no_err_stub 7
 66 | isr_err_stub    8
 67 | isr_no_err_stub 9
 68 | isr_err_stub    10
 69 | isr_err_stub    11
 70 | isr_err_stub    12
 71 | isr_err_stub    13
 72 | isr_err_stub    14
 73 | isr_no_err_stub 15
 74 | isr_no_err_stub 16
 75 | isr_err_stub    17
 76 | isr_no_err_stub 18
 77 | isr_no_err_stub 19
 78 | isr_no_err_stub 20
 79 | isr_no_err_stub 21
 80 | isr_no_err_stub 22
 81 | isr_no_err_stub 23
 82 | isr_no_err_stub 24
 83 | isr_no_err_stub 25
 84 | isr_no_err_stub 26
 85 | isr_no_err_stub 27
 86 | isr_no_err_stub 28
 87 | isr_no_err_stub 29
 88 | isr_err_stub    30
 89 | isr_no_err_stub 31
 90 | %assign i 32
 91 | %rep 224
 92 |     isr_no_err_stub i
 93 | %assign i i+1
 94 | %endrep
 95 | 
 96 | common_exception_handler:
 97 |     pushaq
 98 |     mov rdi, rsp
 99 |     call idt_exception_handler
100 |     popaq
101 |     add rsp, 10h
102 |     sti
103 |     iretq
104 | 
105 | section .data
106 | global interrupt_vector_table
107 | interrupt_vector_table:
108 | %assign i 0
109 | %rep    256
110 |     dq isr_stub_%+i ; Using DQ as we are x64
111 | %assign i i+1
112 | %endrep


--------------------------------------------------------------------------------
/interface/hypervisor/vmcall_if.h:
--------------------------------------------------------------------------------
 1 | #ifndef VMCALL_IF_H
 2 | #define VMCALL_IF_H
 3 | 
 4 | /*
 5 |  * As cascade is an introspection framework we want to be able to
 6 |  * control the introspection + host capabilities from within
 7 |  * guest applications.
 8 |  * 
 9 |  * This VMCALL interface gives applications within the guest
10 |  * rudimentary ability to perform such actions.
11 |  */
12 | #include <stddef.h>
13 | #include <stdint.h>
14 | #include <stdbool.h>
15 | 
16 | /*
17 |  * Secret key which the guest needs to utilise to allow for
18 |  * accessing the VMCALL interface.
19 |  */
20 | #define VMCALL_SECRET_KEY ((size_t)0x0CA5CADE)
21 | 
22 | /*
23 |  * Generic VMCALL actions that the hypervisor provides.
24 |  */
25 | #define VMCALL_ACTION_CHECK_PRESENCE (0ull)
26 | 
27 | /* Definition of an action identifier for a VMCALL. */
28 | typedef size_t vmcall_id_t;
29 | 
30 | /* Definition of a return status code for a VMCALL. */
31 | typedef size_t vmcall_status_t;
32 | 
33 | #define VMCALL_STATUS_OK 0ull
34 | #define VMCALL_STATUS_INVALID_PARAM 1ull
35 | #define VMCALL_STATUS_INVALID_ID 2ull
36 | #define VMCALL_STATUS_INTERNAL_ERROR 3ull
37 | 
38 | /*
39 |  * Definition of a VMCALL exit handler callback.
40 |  * buffer is the HOST copy of the buffer provided in
41 |  * the vmcall_param. Upon completion of the callback
42 |  * this will then get copied back into guest context.
43 |  *
44 |  * Opaque is whatever was passed in when registering
45 |  * the VMCALL event by the hypervisor.
46 |  *
47 |  * Return value size_t is a status code for the VMCALL.
48 |  */
49 | typedef vmcall_status_t (*vmcall_cbk_t)(uint8_t *buffer, void *opaque);
50 | 
51 | /*
52 |  * Definition of the parameter struct a guest uses when performing a
53 |  * VMCALL to the hypervisor.
54 |  */
55 | struct vmcall_param {
56 |     /* The unique identifier of the action to call. */
57 |     vmcall_id_t id;
58 |     /*
59 |      * Extra buffer space for a vmcall parameter.
60 |      * This can be utilised for storing extra data
61 |      * to be communicated between host <-> guest on
62 |      * the VMCALL. Statically fixed to this size to
63 |      * just make life easier when dealing with reading
64 |      * memory to prevent having to alloc & free all
65 |      * the time.
66 |      */
67 |     uint8_t buffer[4096];
68 | };
69 | 
70 | void vmcall_register_action(struct vmcall_ctx *ctx,
71 |                             vmcall_id_t id,
72 |                             vmcall_cbk_t callback,
73 |                             void *opaque);
74 | 
75 | #endif /* VMCALL_IF_H */


--------------------------------------------------------------------------------
/interface/usermode/hypervisor.cpp:
--------------------------------------------------------------------------------
  1 | #include <iostream>
  2 | #include <fstream>
  3 | #include <vector>
  4 | #include <windows.h>
  5 | #include "hypervisor.hpp"
  6 | 
  7 | static bool g_call_fail = false;
  8 | 
  9 | static long exception_handler(_EXCEPTION_POINTERS *info)
 10 | {
 11 |     g_call_fail = true;
 12 |     std::cout << "Exception handler called.\n";
 13 |     return EXCEPTION_CONTINUE_EXECUTION;
 14 | }
 15 | 
 16 | void hypervisor::register_exception_handler()
 17 | {
 18 |     AddVectoredExceptionHandler(1 /* FIRST */, exception_handler);
 19 | }
 20 | 
 21 | bool hypervisor::check_and_clear_exception()
 22 | {
 23 |     bool result = g_call_fail;
 24 |     g_call_fail = false;
 25 |     return result;
 26 | }
 27 | 
 28 | bool hypervisor::send_call(vmcall_param &param)
 29 | {
 30 |     /*
 31 |      * Send the action via the VMCALL interface and check
 32 |      * the VEH global g_call_fail to see whether the HW
 33 |      * responded with a #UD or other fault.
 34 |      */
 35 |     size_t result;
 36 |     asm volatile
 37 |     (
 38 |         "vmcall\n\t"
 39 |         : "=a"(result)
 40 |         : "c"(VMCALL_SECRET_KEY), "d"(&param)
 41 |     );
 42 | 
 43 |     return !check_and_clear_exception() && (result == 0);
 44 | }
 45 | 
 46 | bool hypervisor::check_presence()
 47 | {
 48 |     /*
 49 |      * As it's possible the cascade hypervisor is
 50 |      * hiding it's presence we use the defined VMCALL
 51 |      * interface with secret key to query rather than
 52 |      * attempting to check presence via VMXE or CPUID
 53 |      * hypervisor leafs.
 54 |      */
 55 |     vmcall_param param = {};
 56 |     param.action = ACTION_CHECK_PRESENCE;
 57 |     return send_call(param);
 58 | }
 59 | 
 60 | bool hypervisor::load_plugin(std::string file_name)
 61 | {
 62 |     /* Load the plugin into this process dynamically then store image start. */
 63 |     HMODULE handle_plugin = LoadLibraryA(file_name.c_str());
 64 |     if (!handle_plugin) {
 65 |         std::cout << "Unable to load " << file_name << " into plugin loader.\n";
 66 |     }
 67 | 
 68 |     /*
 69 |      * It is not guaranteed that every page within the loaded image is currently
 70 |      * mapped into the process due to paging. As the hypervisor CANNOT deal with
 71 |      * paged out pages we attempt to read every page to attempt to get them all
 72 |      * present within memory.
 73 |      *
 74 |      * We can guarantee that every page within a DLL is readable.
 75 |      */
 76 |     uint8_t *raw_image = reinterpret_cast<uint8_t *>(handle_plugin);
 77 |     PIMAGE_DOS_HEADER idh = reinterpret_cast<PIMAGE_DOS_HEADER>(raw_image);
 78 |     PIMAGE_NT_HEADERS inh = reinterpret_cast<PIMAGE_NT_HEADERS>(&raw_image[idh->e_lfanew]);
 79 | 
 80 |     for (size_t i = 0; i < inh->OptionalHeader.SizeOfImage; i += 0x1000) {
 81 |         /* Perform a bogus read of the page, using the if Sleep to prevent
 82 |          * optimisation. */
 83 |         if (raw_image[i])
 84 |             Sleep(0);
 85 |     }
 86 | 
 87 |     /* Set up the plugin loading action pointing to raw plugin bytes + size. */
 88 |     vmcall_param_load_plugin plugin_param = {};
 89 |     plugin_param.plugin = raw_image;
 90 | 
 91 |     /* Set up the main vmcall action pointing to our plugin parameters. */
 92 |     vmcall_param param = {};
 93 |     param.action = ACTION_LOAD_PLUGIN;
 94 |     param.param = &plugin_param;
 95 |     param.param_size = sizeof(plugin_param);
 96 |     return send_call(param);
 97 | }
 98 | 
 99 | hypervisor::hypervisor()
100 | {
101 |     /* Register VEH so we can catch #UD's on send failure. */
102 |     register_exception_handler();
103 | }


--------------------------------------------------------------------------------
/hypervisor/platform/standard.h:
--------------------------------------------------------------------------------
  1 | #ifndef STANDARD_H
  2 | #define STANDARD_H
  3 | 
  4 | #include <stdio.h>
  5 | #include <string.h>
  6 | #include <stddef.h>
  7 | #include <stdint.h>
  8 | #include <stdbool.h>
  9 | #include "printf/printf.h"
 10 | #include "arch.h"
 11 | #include "serial.h"
 12 | #include "spinlock.h"
 13 | #include "intrin.h"
 14 | 
 15 | /* Size definitions */
 16 | #define GiB(x) ((size_t)(x) << 30)
 17 | #define MiB(x) ((size_t)(x) << 20)
 18 | #define KiB(x) ((size_t)(x) << 10)
 19 | 
 20 | /* Architecture definitions. */
 21 | #define PAGE_SIZE 0x1000
 22 | #define PAGE_MASK (PAGE_SIZE - 1)
 23 | 
 24 | /* Paging specific level masks. */
 25 | #define ADDRMASK_PML4_INDEX(addr)   (((size_t)addr & 0xFF8000000000ULL) >> 39)
 26 | #define ADDRMASK_PDPTE_INDEX(addr)  (((size_t)addr & 0x7FC0000000ULL) >> 30)
 27 | #define ADDRMASK_PDE_INDEX(addr)    (((size_t)addr & 0x3FE00000ULL) >> 21)
 28 | #define ADDRMASK_PTE_INDEX(addr)    (((size_t)addr & 0x1FF000ULL) >> 12)
 29 | 
 30 | #define ADDRMASK_PDPTE_OFFSET(addr)    ((size_t)addr & 0x3FFFFFFFULL)
 31 | #define ADDRMASK_PDE_OFFSET(addr)      ((size_t)addr & 0x1FFFFFULL)
 32 | #define ADDRMASK_PTE_OFFSET(addr)       ((size_t)addr & 0xFFFULL)
 33 | 
 34 | /* EPT/SLAT specific level masks. */
 35 | #define ADDRMASK_EPT_PML4_INDEX(addr) (((size_t)addr & 0xFF8000000000ULL) >> 39)
 36 | #define ADDRMASK_EPT_PML3_INDEX(addr) (((size_t)addr & 0x7FC0000000ULL) >> 30)
 37 | #define ADDRMASK_EPT_PML2_INDEX(addr) (((size_t)addr & 0x3FE00000ULL) >> 21)
 38 | #define ADDRMASK_EPT_PML1_INDEX(addr) (((size_t)addr & 0x1FF000ULL) >> 12)
 39 | #define ADDRMASK_EPT_PML1_OFFSET(addr) ((size_t)addr & 0xFFFULL)
 40 | 
 41 | /* Utility macros */
 42 | #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
 43 | #define NUMBER_BITS_TYPE(type) (sizeof(type) * 8)
 44 | 
 45 | typedef uintptr_t gpa_t;
 46 | typedef uintptr_t hva_t;
 47 | 
 48 | static inline void wait_for_debugger(void)
 49 | {
 50 |     /*
 51 |      * Just some crappy while != 0 loop which the debugger must explicitly
 52 |      * clear. This is ONLY to be used when we need to attach with GDB and
 53 |      * figure something out. Should not be used for anything else.
 54 |      *
 55 |      * When the debugger reaches this point, you can then simply run:
 56 |      * "set $eax=0" to step through and continue with what you need.
 57 |      *
 58 |      * cli/sti are used to prevent any interrupts during waiting.
 59 |      */
 60 |     static volatile int wait_clear;
 61 | 
 62 |     wait_clear = 1;
 63 | 
 64 |     asm volatile ("cli");
 65 | 
 66 |     while (wait_clear)
 67 |         asm volatile ( "pause" );
 68 | 
 69 |     asm volatile ("sti");
 70 | }
 71 | 
 72 | /* Debug printing */
 73 | static inline void print_buffer(const char *format, ...)
 74 | {
 75 |     static spinlock_t sync_lock = 0;
 76 |     va_list marker;
 77 |     char tmp_buff[512] = { 0 };
 78 | 
 79 |     spin_lock(&sync_lock);
 80 |     va_start(marker, format);
 81 |     vsnprintf(tmp_buff, sizeof(tmp_buff), format, marker);
 82 |     va_end(marker);
 83 |     serial_print(tmp_buff);
 84 |     spin_unlock(&sync_lock);
 85 | }
 86 | 
 87 | #define debug_print(format, ...) \
 88 |     do { \
 89 |         char tmp_buff[512] = { 0 }; \
 90 |         snprintf(tmp_buff, sizeof(tmp_buff), "[0x%lX] %s %s (L%04d) - %s \r\n", __rdtsc(), __FILE__, __func__, __LINE__, format); \
 91 |         print_buffer(tmp_buff, ##__VA_ARGS__); \
 92 |     } while (0)
 93 | 
 94 | #define die_on(cond, ...) do { \
 95 |         if (cond) { \
 96 |             debug_print(__VA_ARGS__); \
 97 |             while (1) {} \
 98 |         } \
 99 |     } while (0)
100 | 
101 | #define assert(cond) die_on(!(cond), "assertion failed.");
102 | 
103 | #ifdef DEBUG_MODULE
104 |     #define DEBUG_PRINT(...) debug_print(__VA_ARGS__)
105 | #else
106 |     #define DEBUG_PRINT(...)
107 | #endif
108 | 
109 | 
110 | #endif /* STANDARD_H */


--------------------------------------------------------------------------------
/hypervisor/vmm/vmm_reg.h:
--------------------------------------------------------------------------------
  1 | #ifndef VMM_REG_H
  2 | #define VMM_REG_H
  3 | 
  4 | #include "ia32_compact.h"
  5 | 
  6 | struct control_registers {
  7 |     cr0 reg_cr0;
  8 |     cr3 reg_cr3;
  9 |     cr4 reg_cr4;
 10 |     uintptr_t gs_base;
 11 |     ia32_debugctl_register debugctl;
 12 |     uintptr_t dr7;
 13 | };
 14 | 
 15 | struct __attribute__ ((aligned (16))) m128a {
 16 |     uint64_t low;
 17 |     int64_t high;
 18 | };
 19 | 
 20 | struct __attribute__ ((aligned (16))) xmm_save_area32
 21 | {
 22 |     uint16_t control_word;
 23 |     uint16_t status_word;
 24 |     uint8_t tag_word;
 25 |     uint8_t reserved1;
 26 |     uint16_t error_opcode;
 27 |     uint32_t error_offset;
 28 |     uint16_t error_selector;
 29 |     uint16_t reserved2;
 30 |     uint32_t data_offset;
 31 |     uint16_t data_selector;
 32 |     uint16_t reserved3;
 33 |     uint32_t mx_csr;
 34 |     uint32_t mx_csr_mask;
 35 |     struct m128a float_registers[8];
 36 |     struct m128a xmm_registers[16];
 37 |     uint8_t reserved4[96];
 38 | };
 39 | 
 40 | struct __attribute__ ((aligned (16))) vcpu_context {
 41 |     uint64_t p1_home;
 42 |     uint64_t p2_home;
 43 |     uint64_t p3_home;
 44 |     uint64_t p4_home;
 45 |     uint64_t p5_home;
 46 |     uint64_t p6_home;
 47 |     uint32_t context_flags;
 48 |     uint32_t mx_csr;
 49 |     uint16_t seg_cs;
 50 |     uint16_t seg_ds;
 51 |     uint16_t seg_es;
 52 |     uint16_t seg_fs;
 53 |     uint16_t seg_gs;
 54 |     uint16_t seg_ss;
 55 |     uint32_t e_flags;
 56 |     uint64_t dr0;
 57 |     uint64_t dr1;
 58 |     uint64_t dr2;
 59 |     uint64_t dr3;
 60 |     uint64_t dr6;
 61 |     uint64_t dr7;
 62 |     uint64_t rax;
 63 |     uint64_t rcx;
 64 |     uint64_t rdx;
 65 |     uint64_t rbx;
 66 |     uint64_t rsp;
 67 |     uint64_t rbp;
 68 |     uint64_t rsi;
 69 |     uint64_t rdi;
 70 |     uint64_t r8;
 71 |     uint64_t r9;
 72 |     uint64_t r10;
 73 |     uint64_t r11;
 74 |     uint64_t r12;
 75 |     uint64_t r13;
 76 |     uint64_t r14;
 77 |     uint64_t r15;
 78 |     uint64_t rip;
 79 |     union
 80 |     {
 81 |         struct xmm_save_area32 flt_save;
 82 |         struct
 83 |         {
 84 |             struct m128a header[2];
 85 |             struct m128a legacy[8];
 86 |             struct m128a xmm0;
 87 |             struct m128a xmm1;
 88 |             struct m128a xmm2;
 89 |             struct m128a xmm3;
 90 |             struct m128a xmm4;
 91 |             struct m128a xmm5;
 92 |             struct m128a xmm6;
 93 |             struct m128a xmm7;
 94 |             struct m128a xmm8;
 95 |             struct m128a xmm9;
 96 |             struct m128a xmm10;
 97 |             struct m128a xmm11;
 98 |             struct m128a xmm12;
 99 |             struct m128a xmm13;
100 |             struct m128a xmm14;
101 |             struct m128a xmm15;
102 |         };
103 |     };
104 |     struct m128a vector_register[26];
105 |     uint64_t vector_control;
106 |     uint64_t debug_control;
107 |     uint64_t last_branch_to_rip;
108 |     uint64_t last_branch_from_rip;
109 |     uint64_t last_exception_to_rip;
110 |     uint64_t last_exception_from_rip;
111 | };
112 | 
113 | #pragma pack(push, 1)
114 | struct task_state_segment_64
115 | {
116 | 	uint32_t reserved0;
117 | 	uint64_t rsp0;
118 | 	uint64_t rsp1;
119 | 	uint64_t rsp2;
120 | 	uint64_t reserved1;
121 | 	uint64_t ist[7];
122 | 	uint64_t reserved3;
123 | 	uint16_t reserved4;
124 | 	uint16_t io_map_base_address;
125 | };
126 | #pragma pack(pop)
127 | 
128 | struct gdt_config {
129 |     __attribute__ ((aligned (16))) segment_descriptor_64 host_gdt[32];
130 |     __attribute__ ((aligned (16))) segment_descriptor_register_64 guest_gdtr;
131 |     __attribute__ ((aligned (16))) segment_descriptor_register_64 host_gdtr;
132 |     segment_selector guest_ldtr;
133 |     segment_selector host_tr;
134 |     struct task_state_segment_64 host_tss;
135 | };
136 | 
137 | #endif /* VMM_REG_H */


--------------------------------------------------------------------------------
/hypervisor/interrupt/idt.c:
--------------------------------------------------------------------------------
  1 | #define DEBUG_MODULE
  2 | #include "platform/standard.h"
  3 | #include "platform/intrin.h"
  4 | #include "vmm/vmm_common.h"
  5 | #include "idt.h"
  6 | 
  7 | struct idt_entry {
  8 |     uint16_t offset_15_to_0;
  9 |     uint16_t segment_selector;
 10 |     uint8_t ist : 3;
 11 |     uint8_t reserved_0 : 5;
 12 |     uint8_t gate_type : 4;
 13 |     uint8_t reserved_1 : 1;
 14 |     uint8_t dpl : 2;
 15 |     uint8_t present : 1;
 16 |     uint16_t offset_31_to_16;
 17 |     uint32_t offset_63_to_32;
 18 |     uint32_t reserved_2;
 19 | } __attribute__((packed));
 20 | 
 21 | struct exception_stack {
 22 | 	uint64_t r15;
 23 | 	uint64_t r14;
 24 | 	uint64_t r13;
 25 | 	uint64_t r12;
 26 | 	uint64_t r11;
 27 | 	uint64_t r10;
 28 | 	uint64_t r9;
 29 | 	uint64_t r8;
 30 | 	uint64_t rdi;
 31 | 	uint64_t rsi;
 32 | 	uint64_t rbp;
 33 | 	uint64_t rbx;
 34 | 	uint64_t rdx;
 35 | 	uint64_t rcx;
 36 | 	uint64_t rax;
 37 | 	uint64_t interrupt_number;
 38 | 	uint64_t error_code;
 39 | 	uint64_t rip;
 40 | 	uint64_t cs;
 41 | 	rfl r_flags;
 42 | };
 43 | 
 44 | #define IDT_ENTRY_COUNT 256
 45 | 
 46 | /* The IDT handler function, this is written in NASM rather
 47 |  * than in C as we need full control of what goes on. */
 48 | extern void *interrupt_vector_table[];
 49 | 
 50 | /* The descriptor table that holds an IDT entry for each vector. */
 51 | __attribute__((aligned(0x10))) static struct idt_entry idt_table[IDT_ENTRY_COUNT] = { 0 };
 52 | 
 53 | /* Holds any interrupts caught in HOST that need to be forwarded to guest. */
 54 | struct cached_interrupt cached_int = { 0 };
 55 | 
 56 | static void set_entry(uint8_t vector, void *isr, uint8_t gate_type)
 57 | {
 58 |     struct idt_entry *entry = &idt_table[vector];
 59 | 
 60 |     entry->offset_15_to_0 = (uint16_t)((uintptr_t)isr);
 61 |     entry->segment_selector = __readcs();
 62 |     entry->ist = 0;
 63 |     entry->reserved_0 = 0;
 64 |     entry->gate_type = gate_type;
 65 |     entry->reserved_1 = 0;
 66 |     entry->dpl = 0;
 67 |     entry->present = true;
 68 |     entry->offset_31_to_16 = (uint16_t)((uintptr_t)isr >> 16);
 69 |     entry->offset_63_to_32 = (uint32_t)((uintptr_t)isr >> 32);
 70 |     entry->reserved_2 = 0;
 71 | }
 72 | 
 73 | /* The exception handler that the common IDT stub function will call. */
 74 | void idt_exception_handler(const struct exception_stack *stack)
 75 | {
 76 |     (void)stack;
 77 | 
 78 |     /* If it is an interrupt that is device specific we should deal with this properly. */
 79 |     die_on(stack->interrupt_number < 0x20, "Unhandled interrupt rip %lX vec 0x%X[%d] err 0x%X cr2=0x%lX",
 80 |            stack->rip,
 81 |            stack->interrupt_number,
 82 |            stack->interrupt_number,
 83 |            stack->error_code, __readcr2());
 84 | 
 85 |     /*
 86 |      * Set the pending interrupt within the VMM, on this vCPU's next
 87 |      * VMENTER the interrupt will be delivered to the guest.
 88 |      */
 89 |     exception_error_code ec = { 0 };
 90 |     ec.index = (uint32_t)stack->error_code;
 91 |     vmm_set_cached_interrupt((exception_vector)stack->interrupt_number, ec);
 92 | }
 93 | 
 94 | void idt_init(segment_descriptor_register_64 *orig_idtr, segment_descriptor_register_64 *new_idtr)
 95 | {
 96 |     /* Store the original IDTR. */
 97 |     __sidt(orig_idtr);
 98 |     DEBUG_PRINT("Original IDTR base_addr %lX limit %X",
 99 |               orig_idtr->base_address, orig_idtr->limit);
100 | 
101 |     /* Create the IDTR. */
102 |     new_idtr->base_address = (uintptr_t)&idt_table[0];
103 |     new_idtr->limit = (uint16_t)sizeof(struct idt_entry) * IDT_ENTRY_COUNT - 1;
104 | 
105 |     /* Fill out all of the IDT entries with their relevant stubs. */
106 |     for (int i = 0; i < IDT_ENTRY_COUNT; i++) {
107 |         set_entry(i, interrupt_vector_table[i], SEGMENT_DESCRIPTOR_TYPE_INTERRUPT_GATE);
108 |     }
109 | 
110 |     DEBUG_PRINT("New IDTR base_addr %lX limit %X",
111 |               new_idtr->base_address, new_idtr->limit);
112 | }
113 | 


--------------------------------------------------------------------------------
/hypervisor/platform/nt.h:
--------------------------------------------------------------------------------
  1 | #ifndef NT_H
  2 | #define NT_H
  3 | 
  4 | #include "standard.h"
  5 | 
  6 | /* Core Windows X64 defines, this is included as will be used for plugins. */
  7 | #pragma pack(push, 1)
  8 | 
  9 | #define IMAGE_DOS_SIGNATURE 0x5A4D
 10 | #define IMAGE_NT_OPTIONAL_HDR64_MAGIC 0x20b
 11 | #define IMAGE_NUMBEROF_DIRECTORY_ENTRIES 16
 12 | 
 13 | #define IMAGE_DIRECTORY_ENTRY_EXPORT          0   // Export Directory
 14 | #define IMAGE_DIRECTORY_ENTRY_IMPORT          1   // Import Directory
 15 | #define IMAGE_DIRECTORY_ENTRY_RESOURCE        2   // Resource Directory
 16 | #define IMAGE_DIRECTORY_ENTRY_EXCEPTION       3   // Exception Directory
 17 | #define IMAGE_DIRECTORY_ENTRY_SECURITY        4   // Security Directory
 18 | #define IMAGE_DIRECTORY_ENTRY_BASERELOC       5   // Base Relocation Table
 19 | #define IMAGE_DIRECTORY_ENTRY_DEBUG           6   // Debug Directory
 20 | //      IMAGE_DIRECTORY_ENTRY_COPYRIGHT       7   // (X86 usage)
 21 | #define IMAGE_DIRECTORY_ENTRY_ARCHITECTURE    7   // Architecture Specific Data
 22 | #define IMAGE_DIRECTORY_ENTRY_GLOBALPTR       8   // RVA of GP
 23 | #define IMAGE_DIRECTORY_ENTRY_TLS             9   // TLS Directory
 24 | #define IMAGE_DIRECTORY_ENTRY_LOAD_CONFIG    10   // Load Configuration Directory
 25 | #define IMAGE_DIRECTORY_ENTRY_BOUND_IMPORT   11   // Bound Import Directory in headers
 26 | #define IMAGE_DIRECTORY_ENTRY_IAT            12   // Import Address Table
 27 | #define IMAGE_DIRECTORY_ENTRY_DELAY_IMPORT   13   // Delay Load Import Descriptors
 28 | #define IMAGE_DIRECTORY_ENTRY_COM_DESCRIPTOR 14   // COM Runtime descriptor
 29 | 
 30 | #define IMAGE_REL_BASED_ABSOLUTE 0
 31 | #define IMAGE_REL_BASED_HIGH 1
 32 | #define IMAGE_REL_BASED_LOW 2
 33 | #define IMAGE_REL_BASED_HIGHLOW 3
 34 | #define IMAGE_REL_BASED_HIGHADJ 4
 35 | #define IMAGE_REL_BASED_MIPS_JMPADDR 5
 36 | #define IMAGE_REL_BASED_ARM_MOV32 5
 37 | #define IMAGE_REL_BASED_THUMB_MOV32 7
 38 | #define IMAGE_REL_BASED_MIPS_JMPADDR16 9
 39 | #define IMAGE_REL_BASED_IA64_IMM64 9
 40 | #define IMAGE_REL_BASED_DIR64 10
 41 | 
 42 | struct image_dos_header {
 43 |     uint16_t e_magic;
 44 |     uint16_t e_cblp;
 45 |     uint16_t e_cp;
 46 |     uint16_t e_crlc;
 47 |     uint16_t e_cparhdr;
 48 |     uint16_t e_minalloc;
 49 |     uint16_t e_maxalloc;
 50 |     uint16_t e_ss;
 51 |     uint16_t e_sp;
 52 |     uint16_t e_csum;
 53 |     uint16_t e_ip;
 54 |     uint16_t e_cs;
 55 |     uint16_t e_lfarlc;
 56 |     uint16_t e_ovno;
 57 |     uint16_t e_res[4];
 58 |     uint16_t e_oemid;
 59 |     uint16_t e_oeminfo;
 60 |     uint16_t e_res2[10];
 61 |     uint32_t e_lfanew;
 62 | };
 63 | 
 64 | struct image_file_header {
 65 |     uint16_t machine;
 66 |     uint16_t number_of_sections;
 67 |     uint32_t time_date_stamp;
 68 |     uint32_t pointer_to_symbol_table;
 69 |     uint32_t number_of_symbols;
 70 |     uint16_t size_of_optional_header;
 71 |     uint16_t characteristics;
 72 | };
 73 | 
 74 | struct image_data_directory {
 75 |     uint32_t virtual_address;
 76 |     uint32_t size;
 77 | };
 78 | 
 79 | struct image_optional_header64 {
 80 |     uint16_t magic;
 81 |     uint8_t major_linker_version;
 82 |     uint8_t minor_linker_version;
 83 |     uint32_t size_of_code;
 84 |     uint32_t size_of_initialized_data;
 85 |     uint32_t size_of_uninitialized_data;
 86 |     uint32_t address_of_entry_point;
 87 |     uint32_t base_of_code;
 88 |     uint64_t image_base;
 89 |     uint32_t section_alignment;
 90 |     uint32_t file_alignment;
 91 |     uint16_t major_operating_system_version;
 92 |     uint16_t minor_operating_system_version;
 93 |     uint16_t major_image_version;
 94 |     uint16_t minor_image_version;
 95 |     uint16_t major_subsystem_version;
 96 |     uint16_t minor_subsystem_version;
 97 |     uint32_t win_32_version_value;
 98 |     uint32_t size_of_image;
 99 |     uint32_t size_of_headers;
100 |     uint32_t check_sum;
101 |     uint16_t subsystem;
102 |     uint16_t dll_characteristics;
103 |     uint64_t size_of_stack_reserve;
104 |     uint64_t size_of_stack_commit;
105 |     uint64_t size_of_heap_reserve;
106 |     uint64_t size_of_heap_commit;
107 |     uint32_t loader_flags;
108 |     uint32_t number_of_rva_and_sizes;
109 |     struct image_data_directory data_directory[IMAGE_NUMBEROF_DIRECTORY_ENTRIES];
110 | };
111 | 
112 | struct image_nt_headers64 {
113 |     uint32_t signature;
114 |     struct image_file_header file_header;
115 |     struct image_optional_header64 optional_header;
116 | };
117 | 
118 | struct image_base_relocation {
119 |     uint32_t virtual_address;
120 |     uint32_t size_of_block;
121 | };
122 | 
123 | struct image_export_directory {
124 | 	uint32_t characteristics;
125 | 	uint32_t time_date_stamp;
126 | 	uint16_t major_version;
127 | 	uint16_t minor_version;
128 | 	uint32_t name;
129 | 	uint32_t base;
130 | 	uint32_t number_of_functions;
131 | 	uint32_t number_of_names;
132 | 	uint32_t address_of_functions;     // RVA from base of image
133 | 	uint32_t address_of_names;         // RVA from base of image
134 | 	uint32_t address_of_name_ordinals; // RVA from base of image
135 | };
136 | 
137 | #pragma pack(pop)
138 | 
139 | #endif /* NT_H */


--------------------------------------------------------------------------------
/hypervisor/memory/mem.c:
--------------------------------------------------------------------------------
  1 | #include "mem.h"
  2 | 
  3 | static void copy_physical_page(enum copy_dir dir, uintptr_t addr, void *buffer, size_t size)
  4 | {
  5 |     if (dir == COPY_READ) {
  6 |         memcpy(buffer, (const void *)addr, size);
  7 |     } else {
  8 |         memcpy((void *)addr, buffer, size);
  9 |     }
 10 | }
 11 | 
 12 | static pt_entry_64 *get_pte_from_va(cr3 table, void *va, int *level)
 13 | {
 14 |     size_t pml4_idx = ADDRMASK_PML4_INDEX(va);
 15 |     size_t pdpte_idx = ADDRMASK_PDPTE_INDEX(va);
 16 |     size_t pde_idx = ADDRMASK_PDE_INDEX(va);
 17 |     size_t pte_idx = ADDRMASK_PTE_INDEX(va);
 18 | 
 19 |     pml4e_64 *pml4 = (pml4e_64 *)((uintptr_t)table.address_of_page_directory * PAGE_SIZE);
 20 |     pml4e_64 *pml4e = &pml4[pml4_idx];
 21 |     if (!pml4e->present) {
 22 |         *level = 4;
 23 |         return (pt_entry_64 *)pml4e;
 24 |     }
 25 | 
 26 |     pdpte_64 *pdpt = (pdpte_64 *)((uintptr_t)pml4e->page_frame_number * PAGE_SIZE);
 27 |     pdpte_64 *pdpte = &pdpt[pdpte_idx];
 28 |     if (!pdpte->present || pdpte->large_page) {
 29 |         *level = 3;
 30 |         return (pt_entry_64 *)pdpte;
 31 |     }
 32 | 
 33 |     pde_64 *pd = (pde_64 *)((uintptr_t)pdpte->page_frame_number * PAGE_SIZE);
 34 |     pde_64 *pde = &pd[pde_idx];
 35 |     if (!pde->present || pde->large_page) {
 36 |         *level = 2;
 37 |         return (pt_entry_64 *)pde;
 38 |     }
 39 | 
 40 |     pte_64 *pt = (pte_64 *)((uintptr_t)pde->page_frame_number * PAGE_SIZE);
 41 |     pte_64 *pte = &pt[pte_idx];
 42 |     *level = 1;
 43 |     return (pt_entry_64 *)pte;
 44 | }
 45 | 
 46 | uintptr_t mem_va_to_pa(cr3 table, void *va)
 47 | {
 48 |     int level;
 49 |     pt_entry_64 *entry = get_pte_from_va(table, va, &level);
 50 |     if (!entry->present) {
 51 |         return 0;
 52 |     }
 53 | 
 54 |     switch (level) {
 55 |     case 4:
 56 |         die_on(true, "Invalid level of 4 retrieved for va %lX", va);
 57 |         break;
 58 |     case 3:
 59 |     {
 60 |         pdpte_1gb_64 pdpte;
 61 |         pdpte.flags = entry->flags;
 62 |         return (pdpte.page_frame_number * GiB(1)) + ADDRMASK_PDPTE_OFFSET(va);
 63 |     }
 64 |     case 2:
 65 |     {
 66 |         pde_2mb_64 pde;
 67 |         pde.flags = entry->flags;
 68 |         return (pde.page_frame_number * MiB(2)) + ADDRMASK_PDE_OFFSET(va);
 69 |     }
 70 |     case 1:
 71 |     {
 72 |         pte_64 pte;
 73 |         pte.flags = entry->flags;
 74 |         return (pte.page_frame_number * PAGE_SIZE) + ADDRMASK_PTE_OFFSET(va);
 75 |     }
 76 |     default:
 77 |         die_on(true, "Invalid pte level %d for va %lX", level, va);
 78 |         break;
 79 |     }
 80 | }
 81 | 
 82 | bool mem_copy_virt_tofrom_host(enum copy_dir dir, cr3 table,
 83 |                              uintptr_t addr, void *buffer, size_t size)
 84 | {
 85 |     die_on(!table.flags, "Invalid CR3 value");
 86 |     die_on(!addr, "Invalid virtual address");
 87 |     die_on(!buffer, "Invalid host buffer");
 88 |     die_on(!size, "Invalid size");
 89 | 
 90 |     bool result = true;
 91 | 
 92 |     /*
 93 |      * As buffer referenced in the virtual address may not have a contiguous
 94 |      * physical address for each page, we need to retrieve and copy pages individually.
 95 |      */
 96 |     while (size) {
 97 |         /* Calculate how many bytes need to be copied for this page. */
 98 |         size_t page_offset = ADDRMASK_PTE_OFFSET(addr);
 99 |         size_t copy_this_page = PAGE_SIZE - page_offset;
100 |         size_t bytes_to_copy = (copy_this_page < size) ? copy_this_page : size;
101 | 
102 |         /* Get this physical address of this page. */
103 |         uintptr_t phys_addr = mem_va_to_pa(table, (void *)addr);
104 |         if (!phys_addr) {
105 |             result = false;
106 |             break;
107 |         }
108 | 
109 |         /* Do the operation for copying the page. */
110 |         copy_physical_page(dir, phys_addr, buffer, bytes_to_copy);
111 | 
112 |         /* Update counters for next page. */
113 |         addr += bytes_to_copy;
114 |         buffer = (void *)((uintptr_t)buffer + bytes_to_copy);
115 |         size -= bytes_to_copy;
116 |     }
117 | 
118 |     return result;
119 | }
120 | 
121 | bool mem_copy_virt_to_virt(cr3 src_cr3, void *src, cr3 dest_cr3, void *dest, size_t size)
122 | {
123 |     die_on(!src_cr3.flags, "Invalid source CR3 value");
124 |     die_on(!src, "Invalid source value");
125 |     die_on(!dest_cr3.flags, "Invalid dest CR3 value");
126 |     die_on(!dest, "Invalid destination value");
127 |     die_on(!size, "Invalid size specified");
128 | 
129 |     bool result = true;
130 | 
131 |     /* Re-read our guest -> host copy, pretty much identical. */
132 |     uintptr_t virt_src = (uintptr_t)src;
133 |     uintptr_t virt_dest = (uintptr_t)dest;
134 |     while (size) {
135 |         /* Calculate how many bytes we can do from first page. */
136 |         size_t src_page_offset = ADDRMASK_PTE_OFFSET(virt_src);
137 |         size_t src_page_bytes = PAGE_SIZE - src_page_offset;
138 | 
139 |         size_t dest_page_offset = ADDRMASK_PTE_OFFSET(virt_dest);
140 |         size_t dest_page_bytes = PAGE_SIZE - dest_page_offset;
141 | 
142 |         /* Make sure we are not overlapping a copy from dest or source.
143 |          * Also make sure we're not copying more than what is available left to copy. */
144 |         size_t copy_bytes = (src_page_bytes < dest_page_bytes) ? src_page_bytes : dest_page_bytes;
145 |         copy_bytes = (copy_bytes < size) ? copy_bytes : size;
146 | 
147 |         /* Get the physical addresses for src & dest.
148 |          * Since we map all physical memory into VMROOT, just copy like normal. */
149 |         uint8_t *phys_src = (uint8_t *)mem_va_to_pa(src_cr3, (void *)virt_src);
150 |         uint8_t *phys_dest = (uint8_t *)mem_va_to_pa(dest_cr3, (void *)virt_dest);
151 |         if (!phys_src || !phys_dest) {
152 |             result = false;
153 |             break;
154 |         }
155 | 
156 |         memcpy(&phys_dest[dest_page_offset], &phys_src[src_page_offset], copy_bytes);
157 |         virt_src += copy_bytes;
158 |         virt_dest += copy_bytes;
159 |         size -= copy_bytes;
160 |     }
161 | 
162 |     return result;
163 | }
164 | 


--------------------------------------------------------------------------------
/hypervisor/vmm/vmcall.c:
--------------------------------------------------------------------------------
  1 | #define DEBUG_MODULE
  2 | #include "platform/standard.h"
  3 | #include "platform/intrin.h"
  4 | #include "memory/mem.h"
  5 | #include "memory/vmem.h"
  6 | #include "handler.h"
  7 | #include "vmcall.h"
  8 | #include "vmcall_if.h"
  9 | #include "vmm_common.h"
 10 | 
 11 | struct vmcall_handler {
 12 |     /* Linked list pointer. */
 13 |     struct vmcall_handler *next;
 14 |     /* The action ID for identification. */
 15 |     vmcall_id_t id;
 16 |     /* The callback for the vmcall. */
 17 |     vmcall_cbk_t callback;
 18 |     /* Callback specific data. */
 19 |     void *opaque;
 20 | };
 21 | 
 22 | struct vmcall_ctx {
 23 |     /* Hold a linked list of vmcall handlers. */
 24 |     struct vmcall_handler *handlers;
 25 |     /* Back reference to the VMM context */
 26 |     struct vmm_ctx *vmm;
 27 | };
 28 | 
 29 | static struct vmcall_handler *find_handler(struct vmcall_ctx *ctx, vmcall_id_t id)
 30 | {
 31 |     struct vmcall_handler *curr = ctx->handlers;
 32 |     while (curr) {
 33 |         if (curr->id == id)
 34 |             return curr;
 35 | 
 36 |         curr = curr->next;
 37 |     }
 38 |     return NULL;
 39 | }
 40 | 
 41 | static vmcall_status_t handle_check_presence(uint8_t *buffer, void *opaque)
 42 | {
 43 |     (void)buffer;
 44 |     (void)opaque;
 45 |     DEBUG_PRINT("Guest checked presence.");
 46 |     return VMCALL_STATUS_OK;
 47 | }
 48 | 
 49 | static void vmcall_exit_handle(struct vcpu_ctx *vcpu, void *opaque, bool *move_to_next)
 50 | {
 51 |     static const exception_error_code DEFAULT_EC = { 0 };
 52 | 
 53 |     /*
 54 |      * Handled VMCALL's from the guest.
 55 |      * calling convention for the VMCALL interface is as follows:
 56 |      *
 57 |      * RCX = SECRET_KEY
 58 |      * RDX = (struct vmcall_param *) - guest pointer
 59 |      * 
 60 |      * On return:
 61 |      * RAX = vmcall status
 62 |      *
 63 |      * If RCX is not equal to the secret key, no action taken.
 64 |      * If present, attempt to read the action parameter and parse.
 65 |      */
 66 |     size_t secret_key = vcpu->guest_context.rcx;
 67 |     uintptr_t guest_param = vcpu->guest_context.rdx;
 68 | 
 69 |     if (secret_key != VMCALL_SECRET_KEY) {
 70 |         *move_to_next = false;
 71 |         vmm_inject_guest_event(invalid_opcode, DEFAULT_EC);
 72 |         return;
 73 |     }
 74 | 
 75 |     /* Ensure we actually have a parameter specified. */
 76 |     vmcall_status_t status;
 77 |     if (!guest_param) {
 78 |         status = VMCALL_STATUS_INVALID_PARAM;
 79 |         goto tidyup;
 80 |     }
 81 | 
 82 |     /* Copy the parameter from the guest into host context. */
 83 |     cr3 guest_cr3;
 84 |     guest_cr3.flags = __vmread(VMCS_GUEST_CR3);
 85 |     die_on(!guest_cr3.flags, "Guest CR3 value cannot be retrieved.");
 86 | 
 87 |     struct vmcall_param host_param = { 0 };
 88 |     if (!mem_copy_virt_tofrom_host(COPY_READ, guest_cr3, guest_param,
 89 |                                    &host_param, sizeof(host_param))) {
 90 |         status = VMCALL_STATUS_INVALID_PARAM;
 91 |         goto tidyup;
 92 |     }
 93 | 
 94 |     /* Now lets actually do the VMCALL callback handling. */
 95 |     struct vmcall_ctx *ctx = (struct vmcall_ctx *)opaque;
 96 | 
 97 |     spin_lock(&ctx->vmm->lock);
 98 | 
 99 |     /* Find the handler relevant for the VMCALL. */
100 |     struct vmcall_handler *handler = find_handler(ctx, host_param.id);
101 |     if (!handler) {
102 |         status = VMCALL_STATUS_INVALID_ID;
103 |         goto tidyup_locked;
104 |     }
105 | 
106 |     /* Call the handler and store status. */
107 |     status = handler->callback(host_param.buffer, handler->opaque);
108 |     DEBUG_PRINT("VMCALL callback id=%ld status=%ld", host_param.id, status);
109 | 
110 |     /* Copy the modified host parameter back to guest memory. */
111 |     if (!mem_copy_virt_tofrom_host(COPY_WRITE, guest_cr3, guest_param,
112 |                                    &host_param, sizeof(host_param))) {
113 |         status = VMCALL_STATUS_INTERNAL_ERROR;
114 |     }
115 | 
116 |     /* Now store success status. */
117 | tidyup_locked:
118 |     spin_unlock(&ctx->vmm->lock);
119 | tidyup:
120 |     vcpu->guest_context.rax = status;
121 |     *move_to_next = true;
122 | }
123 | 
124 | struct vmcall_ctx *vmcall_init(struct vmm_ctx *vmm)
125 | {
126 |     /* Allocate our context structure for VMCALL handling. */
127 |     struct vmcall_ctx *ctx = vmem_alloc(sizeof(struct vmcall_ctx), MEM_WRITE);
128 |     die_on(!ctx, "Unable to allocate context for VMCALL handlers.");
129 |     vmm->vmcall = ctx;
130 |     ctx->vmm = vmm;
131 | 
132 |     /* Register a VMEXIT reason handler so we can catch & parse VMCALLs. */
133 |     handler_register_exit(vmm->handler, VMX_EXIT_REASON_VMCALL, vmcall_exit_handle, ctx, false);
134 | 
135 |     /* Register our generic VMCALL events. */
136 |     vmcall_register_action(ctx, VMCALL_ACTION_CHECK_PRESENCE, handle_check_presence, ctx);
137 | 
138 |     return ctx;
139 | }
140 | 
141 | void vmcall_register_action(struct vmcall_ctx *ctx,
142 |                             vmcall_id_t id,
143 |                             vmcall_cbk_t callback,
144 |                             void *opaque)
145 | {
146 |     /* Ensure synchronization. */
147 |     spin_lock(&ctx->vmm->lock);
148 | 
149 |     /* Ensure there isn't already a VMCALL handler registered with same ID. */
150 |     die_on(find_handler(ctx, id),
151 |            "Handler already existing for VMCALL id=%ld", id);
152 | 
153 |     /* Allocate a new handler structure. */
154 |     struct vmcall_handler *new_handler = vmem_alloc(sizeof(struct vmcall_handler), MEM_WRITE);
155 |     die_on(!new_handler, "Unable to allocate memory for VMCALL handler.");
156 | 
157 |     new_handler->id = id;
158 |     new_handler->callback = callback;
159 |     new_handler->opaque = opaque;
160 | 
161 |     /* Now add it to the head of our handler list. */
162 |     if (!ctx->handlers)
163 |         ctx->handlers = new_handler;
164 |     else {
165 |         new_handler->next = ctx->handlers;
166 |         ctx->handlers = new_handler;
167 |     }
168 | 
169 |     DEBUG_PRINT("VMCALL registered for id %ld cbk 0x%lX opaque 0x%lX",
170 |                 id, callback, opaque);
171 |     spin_unlock(&ctx->vmm->lock);
172 | }


--------------------------------------------------------------------------------
/hypervisor/platform/intrin.asm:
--------------------------------------------------------------------------------
  1 | section .text
  2 | 
  3 | global __readcs
  4 | global __readcr0
  5 | global __readcr2
  6 | global __readcr3
  7 | global __readcr4
  8 | global __readdr7
  9 | global __rdtsc
 10 | global __writecr0
 11 | global __writecr3
 12 | global __writecr4
 13 | global __lidt
 14 | global __sidt
 15 | global __lgdt
 16 | global __sgdt
 17 | global __lldt
 18 | global __sldt
 19 | global __str
 20 | global __ltr
 21 | global __xsetbv
 22 | global __invd
 23 | global __invlpg
 24 | global __invept
 25 | global __vmxon
 26 | global __vmclear
 27 | global __vmptrld
 28 | global __vmwrite
 29 | global __vmread
 30 | global __vmlaunch
 31 | global __vmresume
 32 | global __capture_context
 33 | global __restore_context
 34 | 
 35 | __readcs:
 36 |     mov ax, cs
 37 |     ret
 38 | 
 39 | __readcr0:
 40 |     mov rax, cr0
 41 |     ret
 42 | 
 43 | __readcr2:
 44 |     mov rax, cr2
 45 |     ret
 46 | 
 47 | __readcr3:
 48 |     mov rax, cr3
 49 |     ret
 50 | 
 51 | __readcr4:
 52 |     mov rax, cr4
 53 |     ret
 54 | 
 55 | __readdr7:
 56 |     mov rax, dr7
 57 |     ret
 58 | 
 59 | __rdtsc:
 60 |     rdtsc
 61 |     shl rdx, 32
 62 |     or rax, rdx
 63 |     ret
 64 | 
 65 | __writecr0:
 66 |     mov cr0, rcx
 67 |     ret
 68 | 
 69 | __writecr3:
 70 |     mov cr3, rcx
 71 |     ret
 72 | 
 73 | __writecr4:
 74 |     mov cr4, rcx
 75 |     ret
 76 | 
 77 | __lidt:
 78 |     lidt [rcx]
 79 |     ret
 80 | 
 81 | __sidt:
 82 |     sidt [rcx]
 83 |     ret
 84 | 
 85 | __lgdt:
 86 |     lgdt [rcx]
 87 |     ret
 88 | 
 89 | __sgdt:
 90 |     sgdt [rcx]
 91 |     ret
 92 | 
 93 | __lldt:
 94 |     lldt [rcx]
 95 |     ret
 96 | 
 97 | __sldt:
 98 |     sldt [rcx]
 99 |     ret
100 | 
101 | __str:
102 |     str word [rcx]
103 |     ret
104 | 
105 | __ltr:
106 |     ltr word [rcx]
107 |     ret
108 | 
109 | __xsetbv:
110 |     ; assume RCX already contains operand1
111 |     ; move operand2 from RDX into EDX:EAX
112 |     mov eax, edx
113 |     shr rdx, 32
114 |     xsetbv
115 |     ret
116 | 
117 | __invd:
118 |     invd
119 |     ret
120 | 
121 | __invlpg:
122 |     invlpg [rcx]
123 |     ret
124 | 
125 | __invept:
126 |     invept rcx, [rdx]
127 |     ret
128 | 
129 | __vmxon:
130 |     vmxon [rcx]
131 |     ret
132 | 
133 | __vmclear:
134 |     vmclear [rcx]
135 |     ret
136 | 
137 | __vmptrld:
138 |     vmptrld [rcx]
139 |     ret
140 | 
141 | __vmwrite:
142 |     vmwrite rcx, rdx
143 |     ret
144 | 
145 | __vmread:
146 |     vmread rax, rcx
147 |     ret
148 | 
149 | __vmlaunch:
150 |     vmlaunch
151 |     ret
152 | 
153 | __vmresume:
154 |     vmresume
155 |     ; No need to RET as context RIP/RSP will change to VMCS_GUEST_R*P
156 | 
157 | __capture_context:
158 |     ; Push RFLAGS onto the stack
159 |     pushfq
160 | 
161 |     ; Low GP registers
162 |     mov [rcx+078h], rax
163 |     mov [rcx+080h], rcx
164 |     mov [rcx+088h], rdx
165 |     mov [rcx+0B8h], r8
166 |     mov [rcx+0C0h], r9
167 |     mov [rcx+0C8h], r10
168 |     mov [rcx+0D0h], r11
169 | 
170 |     ; Low XMM Registers
171 |     movaps [rcx+01A0h], xmm0
172 |     movaps [rcx+01B0h], xmm1
173 |     movaps [rcx+01C0h], xmm2
174 |     movaps [rcx+01D0h], xmm3
175 |     movaps [rcx+01E0h], xmm4
176 |     movaps [rcx+01F0h], xmm5
177 | 
178 |     ; Segment selectors
179 |     mov word [rcx+038h], cs
180 |     mov word [rcx+03Ah], ds
181 |     mov word [rcx+03Ch], es
182 |     mov word [rcx+042h], ss
183 |     mov word [rcx+03Eh], fs
184 |     mov word [rcx+040h], gs
185 | 
186 |     ; High GP registers
187 |     mov [rcx+090h], rbx
188 |     mov [rcx+0A0h], rbp
189 |     mov [rcx+0A8h], rsi
190 |     mov [rcx+0B0h], rdi
191 |     mov [rcx+0D8h], r12
192 |     mov [rcx+0E0h], r13
193 |     mov [rcx+0E8h], r14
194 |     mov [rcx+0F0h], r15
195 | 
196 |     ; FPU Control Word
197 |     fnstcw word [rcx+0100h]
198 |     mov dword [rcx+0102h], 0
199 | 
200 |     ; High XMM Registers
201 |     movaps [rcx+0200h], xmm6
202 |     movaps [rcx+0210h], xmm7
203 |     movaps [rcx+0220h], xmm8
204 |     movaps [rcx+0230h], xmm9
205 |     movaps [rcx+0240h], xmm10
206 |     movaps [rcx+0250h], xmm11
207 |     movaps [rcx+0260h], xmm12
208 |     movaps [rcx+0270h], xmm13
209 |     movaps [rcx+0280h], xmm14
210 |     movaps [rcx+0290h], xmm15
211 | 
212 |     ; XMM control/status register
213 |     stmxcsr dword [rcx+0118h]
214 |     stmxcsr dword [rcx+034h]
215 | 
216 |     ; Fix context RSP values
217 |     lea rax, [rsp+010h]
218 |     mov [rcx+098h], rax
219 |     mov rax, [rsp+08h]
220 |     mov [rcx+0F8h], rax
221 |     mov eax, [rsp]
222 |     mov [rcx+044h], eax
223 | 
224 |     mov dword [rcx+030h], 10000Fh
225 | 
226 |     ; Return
227 |     add rsp, 8
228 |     ret
229 | 
230 | __restore_context:
231 |     movaps  xmm0, [rcx+01A0h]   ;
232 |     movaps  xmm1, [rcx+01B0h]   ;
233 |     movaps  xmm2, [rcx+01C0h]   ;
234 |     movaps  xmm3, [rcx+01D0h]   ;
235 |     movaps  xmm4, [rcx+01E0h]   ;
236 |     movaps  xmm5, [rcx+01F0h]   ;
237 |     movaps  xmm6, [rcx+0200h]   ; Restore all XMM registers
238 |     movaps  xmm7, [rcx+0210h]   ;
239 |     movaps  xmm8, [rcx+0220h]   ;
240 |     movaps  xmm9, [rcx+0230h]   ;
241 |     movaps  xmm10, [rcx+0240h]  ;
242 |     movaps  xmm11, [rcx+0250h]  ;
243 |     movaps  xmm12, [rcx+0260h]  ;
244 |     movaps  xmm13, [rcx+0270h]  ;
245 |     movaps  xmm14, [rcx+0280h]  ;
246 |     movaps  xmm15, [rcx+0290h]  ;
247 |     ldmxcsr [rcx+034h]          ;
248 | 
249 |     mov     rax, [rcx+078h]     ;
250 |     mov     rdx, [rcx+088h]     ;
251 |     mov     r8,  [rcx+0B8h]     ; Restore volatile registers
252 |     mov     r9,  [rcx+0C0h]     ;
253 |     mov     r10, [rcx+0C8h]     ;
254 |     mov     r11, [rcx+0D0h]     ;
255 | 
256 |     mov     rbx, [rcx+090h]     ;
257 |     mov     rsi, [rcx+0A8h]     ;
258 |     mov     rdi, [rcx+0B0h]     ;
259 |     mov     rbp, [rcx+0A0h]     ; Restore non volatile regsiters
260 |     mov     r12, [rcx+0D8h]     ;
261 |     mov     r13, [rcx+0E0h]     ;
262 |     mov     r14, [rcx+0E8h]     ;
263 |     mov     r15, [rcx+0F0h]     ;
264 | 
265 |     cli                         ; Disable interrupts
266 |     push    qword [rcx+044h]    ; Push RFLAGS on stack
267 |     popfq                       ; Restore RFLAGS
268 |     mov     rsp, [rcx+098h]     ; Restore old stack
269 |     push    qword [rcx+0F8h]    ; Push RIP on old stack
270 |     mov     rcx, [rcx+080h]     ; Restore RCX since we spilled it
271 |     ret                         ; Restore RIP


--------------------------------------------------------------------------------
/hypervisor/vmm/vmm_common.h:
--------------------------------------------------------------------------------
  1 | #ifndef VMM_COMMON_H
  2 | #define VMM_COMMON_H
  3 | 
  4 | #include <linux/types.h>
  5 | #include "platform/intrin.h"
  6 | #include "platform/util.h"
  7 | #include "vmm_reg.h"
  8 | #include "ia32_compact.h"
  9 | 
 10 | /* At max, support up to 100 vCPUs. */
 11 | #define VCPU_MAX 100
 12 | 
 13 | /* Defines the size of the host stack. */
 14 | #define HOST_STACK_SIZE 0x6000
 15 | 
 16 | struct vmm_init_params {
 17 |     __attribute__((aligned(0x10))) cr3 guest_cr3;
 18 |     __attribute__((aligned(0x10))) cr3 host_cr3;
 19 |     __attribute__((aligned(0x10))) segment_descriptor_register_64 guest_idtr;
 20 |     __attribute__((aligned(0x10))) segment_descriptor_register_64 host_idtr;
 21 |     uintptr_t image_base;
 22 |     size_t image_size;
 23 | };
 24 | 
 25 | /*
 26 |  * Holds whether a vCPU currently has a cached interrupt
 27 |  * to deliver to the guest.
 28 |  *
 29 |  * No synchronisation method is needed for this as
 30 |  * set/get of this structure will happen within same
 31 |  * vCPU.
 32 |  *
 33 |  * TODO: HOWEVER we should probably account for multiple
 34 |  * interrupts happening within same VMEXIT frame eventually.
 35 |  */
 36 | struct cached_interrupt {
 37 |     exception_vector vector;
 38 |     exception_error_code code;
 39 |     bool pending;
 40 | };
 41 | 
 42 | /* Holds the global context for the VMM. */
 43 | struct vmm_ctx {
 44 |     struct vmm_init_params init;
 45 |     struct ept_ctx *ept;
 46 |     struct handler_ctx *handler;
 47 |     struct vmcall_ctx *vmcall;
 48 |     spinlock_t lock;
 49 | };
 50 | 
 51 | /* Holds the context specific to a singular vCPU. */
 52 | struct vcpu_ctx {
 53 |     __attribute__ ((aligned (PAGE_SIZE))) uint8_t host_stack[HOST_STACK_SIZE];
 54 |     __attribute__ ((aligned (PAGE_SIZE))) vmxon host_vmxon;
 55 |     __attribute__ ((aligned (PAGE_SIZE))) vmcs guest_vmcs;
 56 |     __attribute__ ((aligned (PAGE_SIZE))) uint8_t msr_trap_bitmap[PAGE_SIZE];
 57 | 
 58 |     struct vcpu_context hyperjack_context;
 59 |     struct control_registers guest_ctrl_regs;
 60 |     struct vcpu_context guest_context;
 61 |     struct gdt_config gdt_cfg;
 62 |     struct cached_interrupt cached_int;
 63 | 
 64 |     struct vmm_ctx *vmm;
 65 |     struct nested_ctx *nested;
 66 |     size_t idx;
 67 | 
 68 |     bool launched;
 69 |     size_t last_ignored_msr;
 70 | };
 71 | 
 72 | static inline struct vcpu_ctx *vmm_get_vcpu_ctx(void)
 73 | {
 74 |     /* 
 75 |      * Dirty hack, as GS_BASE is actually unused on x86_64
 76 |      * we can use this field in the host context to store/retrieve
 77 |      * the vCPU context pointer.
 78 |      */
 79 |     struct vcpu_ctx *vcpu = (struct vcpu_ctx *)rdmsr(IA32_GS_BASE);
 80 |     die_on(!vcpu, "vCPU context not correct.");
 81 |     return vcpu;
 82 | }
 83 | 
 84 | static inline size_t vmm_read_gp_register(struct vcpu_ctx *vcpu, uint64_t base_reg)
 85 | {
 86 |     assert(base_reg < 16);
 87 | 
 88 |     size_t reg_val;
 89 |     if (base_reg == 4 /* RSP is 4th in array context. */)
 90 |         reg_val = __vmread(VMCS_GUEST_RSP);
 91 |     else {
 92 |         uint64_t *gp_arr = &vcpu->guest_context.rax;
 93 |         reg_val = gp_arr[base_reg];
 94 |     }
 95 |     return reg_val;
 96 | }
 97 | 
 98 | static inline size_t vmm_read_seg_register(struct vcpu_ctx *vcpu, uint64_t seg_index)
 99 | {
100 |     assert(seg_index < 6);
101 | 
102 |     switch (seg_index) {
103 |     case 0:
104 |         return vcpu->guest_context.seg_es;
105 |     case 1:
106 |         return vcpu->guest_context.seg_cs;
107 |     case 2:
108 |         return vcpu->guest_context.seg_ss;
109 |     case 3:
110 |         return vcpu->guest_context.seg_ds;
111 |     case 4:
112 |         return vcpu->guest_context.seg_fs;
113 |     case 5:
114 |         return vcpu->guest_context.seg_gs;
115 |     }
116 | }
117 | 
118 | static inline void vmm_set_cached_interrupt(exception_vector vector, exception_error_code code)
119 | {
120 |     struct vcpu_ctx *vcpu = vmm_get_vcpu_ctx();
121 | 
122 |     vcpu->cached_int.vector = vector;
123 |     vcpu->cached_int.code = code;
124 |     vcpu->cached_int.pending = true;
125 | }
126 | 
127 | static inline void vmm_msr_trap_enable(uint8_t *bitmap, size_t msr, bool trap)
128 | {
129 |     static const size_t LOW_START = 0x0;
130 |     static const size_t LOW_END = 0x1fff;
131 |     static const size_t HIGH_START = 0xc0000000;
132 |     static const size_t HIGH_END = 0xc0001fff;
133 | 
134 |     uint8_t *read_low = &bitmap[0];
135 |     uint8_t *read_high = &bitmap[1024];
136 |     uint8_t *write_low = &bitmap[2048];
137 |     uint8_t *write_high = &bitmap[3072];
138 | 
139 |     if ((msr >= LOW_START) && (msr <= LOW_END)) {
140 |         if (trap) {
141 |             bitmap_set_bit(read_low, msr);
142 |             bitmap_set_bit(write_low, msr);
143 |         } else {
144 |             bitmap_clear_bit(read_low, msr);
145 |             bitmap_clear_bit(write_low, msr);
146 |         }
147 |     } else if ((msr >= HIGH_START) && (msr <= HIGH_END)) {
148 |         size_t offset = msr - HIGH_START;
149 |         if (trap) {
150 |             bitmap_set_bit(read_high, offset);
151 |             bitmap_set_bit(write_high, offset);
152 |         } else {
153 |             bitmap_clear_bit(read_high, offset);
154 |             bitmap_clear_bit(write_high, offset);
155 |         }
156 |     } else {
157 |         die_on(false, "MSR 0x%lX out of valid range", msr);
158 |     }
159 | }
160 | 
161 | static inline uint8_t vmm_virt_addr_bits()
162 | {
163 | 	return __vmread(VMCS_HOST_CR4) & CR4_LA57_MASK ? 57 : 48;
164 | }
165 | 
166 | static inline uint64_t get_canonical(uint64_t la, uint8_t vaddr_bits)
167 | {
168 | 	return ((int64_t)la << (64 - vaddr_bits)) >> (64 - vaddr_bits);
169 | }
170 | 
171 | static inline bool is_noncanonical_address(uint64_t la)
172 | {
173 | 	return get_canonical(la, vmm_virt_addr_bits()) != la;
174 | }
175 | 
176 | static inline cr0 vmm_adjust_cr0(cr0 old_cr0)
177 | {
178 |     cr0 fixed0, fixed1;
179 |     cr0 new_cr0 = old_cr0;
180 |     fixed0.flags = rdmsr(IA32_VMX_CR0_FIXED0);
181 |     fixed1.flags = rdmsr(IA32_VMX_CR0_FIXED1);
182 | 
183 |     new_cr0.flags &= fixed1.flags;
184 |     new_cr0.flags |= fixed0.flags;
185 | 
186 |     DEBUG_PRINT("old_cr0=0x%lX new_cr0=0x%lX fixed0=0x%lX fixed1=0x%lX",
187 |                 old_cr0.flags, new_cr0.flags, fixed0.flags, fixed1.flags);
188 | 
189 |     return new_cr0;
190 | }
191 | 
192 | static inline cr4 vmm_adjust_cr4(cr4 old_cr4)
193 | {
194 |     cr4 fixed0, fixed1;
195 |     cr4 new_cr4 = old_cr4;
196 |     fixed0.flags = rdmsr(IA32_VMX_CR4_FIXED0);
197 |     fixed1.flags = rdmsr(IA32_VMX_CR4_FIXED1);
198 | 
199 |     new_cr4.flags &= fixed1.flags;
200 |     new_cr4.flags |= fixed0.flags;
201 | 
202 |     DEBUG_PRINT("old_cr4=0x%lX new_cr4=0x%lX fixed0=0x%lX fixed1=0x%lX",
203 |                 old_cr4.flags, new_cr4.flags, fixed0.flags, fixed1.flags);
204 | 
205 |     return new_cr4;
206 | }
207 | 
208 | void vmm_inject_guest_event(exception_vector vector, exception_error_code code);
209 | 
210 | #endif /* VMM_COMMON_H */


--------------------------------------------------------------------------------
/hypervisor/memory/pmem.c:
--------------------------------------------------------------------------------
  1 | //#define DEBUG_MODULE
  2 | #include "platform/standard.h"
  3 | #include "platform/spinlock.h"
  4 | #include "pmem.h"
  5 | 
  6 | /*
  7 |  * Standalone physical memory manager.
  8 |  * This module can be used for allocating pages of physical memory within
  9 |  * the host system. The lowest granularity supported is the size of a 4K page
 10 |  * due to page table restrictions (x86_64) as this is used with things such as
 11 |  * EPT where 4k pages are needed this is no problem (and no need to create a
 12 |  * heap allocator then).
 13 |  * 
 14 |  * The MM does not use 0xE820 or the EFI equivalent as the goal of this system
 15 |  * is a bit different. Instead we use a reserved PMEM range that is effectively
 16 |  * a uint8_t array within the .data? section of the application. The reasoning
 17 |  * behind this is that in the future when we look at implementing EPT and potentially
 18 |  * hiding the EFI driver from the guest OS running on the system all allocated/used
 19 |  * memory is actually within actual image so we don't have to traverse or keep
 20 |  * a record of memory allocated elsewhere.
 21 |  */
 22 | 
 23 | /* Defines the size of physical memory that can be used. */
 24 | #define PMEM_SIZE MiB(20)
 25 | #define PMEM_PAGE_COUNT (PMEM_SIZE / PAGE_SIZE)
 26 | 
 27 | #define PAGE_COUNT(byte_count) ((byte_count + PAGE_SIZE - 1) / PAGE_SIZE)
 28 | #define SET_N_BITS(n) ((1 << n) - 1)
 29 | 
 30 | /*
 31 |  * This *SHOULD* reside in the .bss section
 32 |  * not affecting actual PE size, however will be allocated at load
 33 |  * time of the image. 
 34 |  */
 35 | static uint8_t __attribute__ ((aligned (PAGE_SIZE))) pmem_region[PMEM_SIZE] = { 0 };
 36 | 
 37 | /*
 38 |  * A bitmap will be used for storing which pages are used/free
 39 |  * Each bit represents a page. We will also store the index of
 40 |  * the last page allocated, this is to speed up allocation
 41 |  * when contiguous pages need to be allocated (however this
 42 |  * will not help for when we overflow or memory is freed).
 43 |  * 
 44 |  * TODO: Make this better.....
 45 |  */
 46 | static size_t pmem_bitmap[PMEM_PAGE_COUNT / NUMBER_BITS_TYPE(size_t)];
 47 | static size_t pmem_last_chunk_idx;
 48 | static size_t pmem_last_bit_idx;
 49 | 
 50 | static spinlock_t lock;
 51 | static size_t total_allocated = 0;
 52 | 
 53 | static inline bool find_contiguous_unset(size_t count,
 54 |                                          size_t *found_chunk_idx,
 55 |                                          size_t *found_bit_idx)
 56 | {
 57 |     size_t chunk_idx = pmem_last_chunk_idx;
 58 |     size_t bit_idx = pmem_last_bit_idx;
 59 | 
 60 |     do {
 61 | 
 62 |         size_t curr_count = 0;
 63 |         while (1) {
 64 | 
 65 |             /* Check to see if current bit is unsed. */
 66 |             if (!((pmem_bitmap[chunk_idx] >> bit_idx) & 1))
 67 |                 curr_count++;
 68 |             else
 69 |                 curr_count = 0;
 70 | 
 71 |             /* If we have found N contiguous bits return value. */
 72 |             if (curr_count == count) {
 73 |                 *found_chunk_idx = chunk_idx;
 74 |                 *found_bit_idx = bit_idx;
 75 |                 return true;
 76 |             }
 77 | 
 78 |             /* Increment the chunk and bit indexes. */
 79 |             bit_idx = (bit_idx + 1) % NUMBER_BITS_TYPE(pmem_bitmap[0]);
 80 |             if (!bit_idx)
 81 |                 chunk_idx = (chunk_idx + 1) % ARRAY_SIZE(pmem_bitmap);
 82 | 
 83 |             /*
 84 |              * If chunk index has iterated to beginning of bitmap
 85 |              * we must reset current count as overflow cannot count
 86 |              * as contiguous.
 87 |              */
 88 |             if (!chunk_idx && !bit_idx)
 89 |                 curr_count = 0;
 90 |         }
 91 |     } while ((chunk_idx != pmem_last_chunk_idx) && (bit_idx != pmem_last_bit_idx));
 92 | 
 93 |     return false;
 94 | }
 95 | 
 96 | static inline void set_contiguous_bits(size_t chunk_idx, size_t bit_idx, size_t count, bool val)
 97 | {
 98 |     for (size_t i = 0; i < count; i++) {
 99 |         die_on(((pmem_bitmap[chunk_idx] >> bit_idx) & 1) == val,
100 |                "Bit already set, chunk %d bit %d",
101 |                chunk_idx, bit_idx);
102 | 
103 |         if (val)
104 |             pmem_bitmap[chunk_idx] |= 1ull << bit_idx;
105 |         else
106 |             pmem_bitmap[chunk_idx] &= 1ull << bit_idx;
107 | 
108 |         bit_idx = (bit_idx + 1) % NUMBER_BITS_TYPE(pmem_bitmap[0]);
109 |         if (!bit_idx)
110 |             chunk_idx = (chunk_idx + 1) % ARRAY_SIZE(pmem_bitmap);
111 |     }
112 | }
113 | 
114 | void pmem_init(void)
115 | {
116 |     /* Lets ensure everything is cleared/zero'd. */
117 |     pmem_last_chunk_idx = 0;
118 |     pmem_last_bit_idx = 0;
119 |     memset(pmem_bitmap, 0, sizeof(pmem_bitmap));
120 |     memset(pmem_region, 0, sizeof(pmem_region));
121 | 
122 |     spin_init(&lock);
123 | }
124 | 
125 | uintptr_t pmem_alloc_page(void)
126 | {
127 |     spin_lock(&lock);
128 | 
129 |     /* Search for 1 contiguous page in the bitmap that is unset. */
130 |     size_t chunk_idx;
131 |     size_t bit_idx;
132 |     if (find_contiguous_unset(1, &chunk_idx, &bit_idx)) {
133 |         DEBUG_PRINT("Found chunk %d bit %d", chunk_idx, bit_idx);
134 | 
135 |         /* Indicate that bit & page is now in use. */
136 |         set_contiguous_bits(chunk_idx, bit_idx, 1, true);
137 | 
138 |         /* Update the last index stored to help speed up future allocations. */
139 |         pmem_last_chunk_idx = chunk_idx;
140 |         pmem_last_bit_idx = bit_idx;
141 | 
142 |         /* Calculate & return the allocated page's address. */
143 |         size_t offset = (chunk_idx * NUMBER_BITS_TYPE(pmem_bitmap[0])) + bit_idx;
144 |         offset *= PAGE_SIZE;
145 | 
146 |         uint8_t *result = &pmem_region[offset];
147 |         memset(result, 0, PAGE_SIZE);
148 |         spin_unlock(&lock);
149 |         return (uintptr_t)result;
150 |     }
151 | 
152 |     spin_unlock(&lock);
153 |     return 0;
154 | }
155 | 
156 | uintptr_t pmem_alloc_contiguous(size_t bytes)
157 | {
158 |     die_on(!bytes, "Invalid parameter unable to allocate 0 bytes.");
159 |     die_on(bytes > (NUMBER_BITS_TYPE(pmem_bitmap[0]) * PAGE_SIZE),
160 |            "Current pmem allocator cannot allocate enough pages to fit %d bytes",
161 |            bytes);
162 | 
163 |     const size_t number_pages = PAGE_COUNT(bytes);
164 | 
165 |     spin_lock(&lock);
166 | 
167 |     size_t chunk_idx;
168 |     size_t bit_idx;
169 |     if (find_contiguous_unset(number_pages, &chunk_idx, &bit_idx)) {
170 |         DEBUG_PRINT("Found chunk %d bit %d", chunk_idx, bit_idx);
171 | 
172 |         /* Indicate the following bits are used. */
173 |         set_contiguous_bits(chunk_idx, bit_idx, number_pages, true);
174 | 
175 |         /* Update the last index stored to help speed up future allocations. */
176 |         pmem_last_chunk_idx = chunk_idx;
177 |         pmem_last_bit_idx = bit_idx;
178 | 
179 |         /* Calculate the physical address of the buffer. */
180 |         size_t offset_chunk = (NUMBER_BITS_TYPE(pmem_bitmap[0]) * chunk_idx) * PAGE_SIZE;
181 |         size_t offset_bit = bit_idx * PAGE_SIZE;
182 | 
183 |         uint8_t *result = &pmem_region[offset_chunk + offset_bit];
184 |         memset(result, 0, bytes);
185 |         spin_unlock(&lock);
186 |         total_allocated += PAGE_COUNT(bytes) * PAGE_SIZE;
187 |         return (uintptr_t)result;
188 |     }
189 | 
190 |     spin_unlock(&lock);
191 |     return 0;
192 | }
193 | 
194 | void pmem_free_page(uintptr_t page)
195 | {
196 |     /* Assert that the memory to free is within our allocator range.
197 |      * and that it is page aligned. */
198 |     assert(page >= (uintptr_t)pmem_region);
199 |     assert(page < (uintptr_t)pmem_region + sizeof(pmem_region));
200 |     assert(!(page & (PAGE_SIZE - 1)));
201 | 
202 |     spin_lock(&lock);
203 | 
204 |     /* Calculate the page index in the bitmap. */
205 |     size_t offset = page - (uintptr_t)&pmem_region[0];
206 |     size_t full_bit_index = offset / PAGE_SIZE;
207 | 
208 |     size_t chunk_idx = full_bit_index / NUMBER_BITS_TYPE(pmem_bitmap[0]);
209 |     size_t bit_idx = full_bit_index % NUMBER_BITS_TYPE(pmem_bitmap[0]);
210 | 
211 |     set_contiguous_bits(chunk_idx, bit_idx, 1, false);
212 |     spin_unlock(&lock);
213 | }


--------------------------------------------------------------------------------
/hypervisor/memory/vmem.c:
--------------------------------------------------------------------------------
  1 | //#define DEBUG_MODULE
  2 | #include "platform/standard.h"
  3 | #include "platform/spinlock.h"
  4 | #include "platform/intrin.h"
  5 | #include "memory/pmem.h"
  6 | #include "memory/vmem.h"
  7 | #include "memory/mem.h"
  8 | 
  9 | /* Standalone virtual memory manager.
 10 |  * This module created the page tables required to implement virtual memory
 11 |  * which will be used the by HOST/ROOT of the hypervisor.
 12 |  * When booted from an EFI environment normally memory is identity mapped
 13 |  * with a 1:1 of physical to virtual memory. We are going to do the same
 14 |  * for here.
 15 |  * 
 16 |  * However, physical memory (allocated via the pmem module) can be allocated
 17 |  * for use (so contiguous pages can exist) into virtual memory.
 18 |  * The first 512GB of vmem will be identity mapped, and then the second 512GB (PML4 idx 1)
 19 |  * address starting at 0x8000000000 of virtual space will be used for memory allocated
 20 |  * within here.
 21 |  * 
 22 |  * This allows for an easy seperation and determination of identity mapped vs allocated
 23 |  * memory.
 24 |  */
 25 | 
 26 | #define ENTRIES_PER_TABLE 512
 27 | #define DYN_VMEM_START GiB(512)
 28 | 
 29 | struct vmem_ctx {
 30 |     /* Describes the 512 contiguous 512GB memory regions. */
 31 |     __attribute__ ((aligned (PAGE_SIZE))) pml4e_64 pml4[ENTRIES_PER_TABLE];
 32 | 
 33 |     /* Describes the first 512 1GB memory regions within PML4[0] used for identity mapping.
 34 |      * These will be set as large pages.
 35 |      * So that we don't need to go to lower granularity (2MB or 4k). */
 36 |     __attribute__ ((aligned (PAGE_SIZE))) pdpte_1gb_64 identity_pdpt[ENTRIES_PER_TABLE];
 37 | 
 38 |     /*
 39 |      * pml4e_64
 40 |      * --- pdpte_64
 41 |      * ------ pde_64
 42 |      * ---------pte_64
 43 |      */
 44 | 
 45 |     uintptr_t next_free_addr;
 46 |     spinlock_t sync;
 47 | };
 48 | 
 49 | static struct vmem_ctx *m_ctx = NULL;
 50 | 
 51 | static void init_identity_table(struct vmem_ctx *ctx)
 52 | {
 53 |     /* Set out the first PML4E to indicate this is present
 54 |      * and this is what we'll be using for identity mapping. */
 55 |     ctx->pml4[0].present = true;
 56 |     ctx->pml4[0].write = true;
 57 |     ctx->pml4[0].page_frame_number = ((uintptr_t)ctx->identity_pdpt) / PAGE_SIZE;
 58 | 
 59 |     for (size_t i = 0; i < 512; i++) {
 60 |         ctx->identity_pdpt[i].present = true;
 61 |         ctx->identity_pdpt[i].write = true;
 62 |         ctx->identity_pdpt[i].execute_disable = false;
 63 |         ctx->identity_pdpt[i].large_page = true;
 64 |         ctx->identity_pdpt[i].page_frame_number = i;
 65 |     }
 66 | }
 67 | 
 68 | static void create_table_entries(uintptr_t addr, bool write, bool exec)
 69 | {
 70 |     DEBUG_PRINT("Creating page tables for address %lX write %d exec %d", addr, write, exec);
 71 | 
 72 |     size_t pml4_idx = ADDRMASK_PML4_INDEX(addr);
 73 |     size_t pdpte_idx = ADDRMASK_PDPTE_INDEX(addr);
 74 |     size_t pde_idx = ADDRMASK_PDE_INDEX(addr);
 75 |     size_t pte_idx = ADDRMASK_PTE_INDEX(addr);
 76 |     DEBUG_PRINT("PML4[%d] PDPTE[%d] PDE[%d] PTE[%d]", pml4_idx, pdpte_idx, pde_idx, pte_idx);
 77 | 
 78 |     pml4e_64 *pml4e = &m_ctx->pml4[pml4_idx];
 79 |     DEBUG_PRINT("--- PML4E PFN[ADDR] %lX[%lX]", (uintptr_t)pml4e / PAGE_SIZE, pml4e);
 80 | 
 81 |     if (!pml4e->present) {
 82 |         pml4e->write = true;
 83 |         pml4e->page_frame_number = pmem_alloc_page() / PAGE_SIZE;
 84 |         die_on(!pml4e->page_frame_number, "Could not allocate PML4E for addr %lX", addr);
 85 |         pml4e->present = true;
 86 |     }
 87 | 
 88 |     pdpte_64 *pdpt = (pdpte_64 *)((uintptr_t)pml4e->page_frame_number * PAGE_SIZE);
 89 |     pdpte_64 *pdpte = &pdpt[pdpte_idx];
 90 |     DEBUG_PRINT("--- PDPTE PFN[ADDR] %lX[%lX]", (uintptr_t)pdpte / PAGE_SIZE, pdpte);
 91 | 
 92 |     if (!pdpte->present) {
 93 |         pdpte->write = true;
 94 |         pdpte->page_frame_number = pmem_alloc_page() / PAGE_SIZE;
 95 |         die_on(!pdpte->page_frame_number, "Could not allocate PDPTE for addr %lX", addr);
 96 |         pdpte->present = true;
 97 |     }
 98 | 
 99 |     pde_64 *pd = (pde_64 *)((uintptr_t)pdpte->page_frame_number * PAGE_SIZE);
100 |     pde_64 *pde = &pd[pde_idx];
101 |     DEBUG_PRINT("--- PDE PFN[ADDR] %lX[%lX]", (uintptr_t)pde / PAGE_SIZE, pde);
102 | 
103 |     if (!pde->present) {
104 |         pde->write = true;
105 |         pde->page_frame_number = pmem_alloc_page() / PAGE_SIZE;
106 |         die_on(!pde->page_frame_number, "Could not allocate PDE for addr %lX", addr);
107 |         pde->present = true;
108 |     }
109 | 
110 |     pte_64 *pt = (pte_64 *)((uintptr_t)pde->page_frame_number * PAGE_SIZE);
111 |     pte_64 *pte = &pt[pte_idx];
112 |     DEBUG_PRINT("--- PTE PFN[ADDR] %lX[%lX]", (uintptr_t)pte / PAGE_SIZE, pte);
113 | 
114 |     die_on(pte->present, "PTE is already present for addr %lX", addr);
115 |     pte->write = write;
116 |     pte->execute_disable = !exec;
117 |     pte->page_frame_number = pmem_alloc_page() / PAGE_SIZE;
118 |     die_on(!pte->page_frame_number, "Could not allocate PTE for addr %lX", addr);
119 |     pte->present = true;
120 |     DEBUG_PRINT("--- Allocated page memory PFN[ADDR] %lX[%lX]",
121 |                pte->page_frame_number, pte->page_frame_number * PAGE_SIZE);
122 | 
123 |     /* Invalidate the TLB for address. */
124 |     __invlpg(&addr);
125 | 
126 | #ifdef DEBUG_MODULE
127 |     /* Some debug code that ensures that the created PA matches if we
128 |      * traverse the VA back to PA. */
129 |     cr3 tmp_cr3;
130 |     tmp_cr3.flags = __readcr3();
131 |     uintptr_t actual_pa = pte->page_frame_number * PAGE_SIZE;
132 |     uintptr_t calc_pa = mem_va_to_pa(tmp_cr3, (void *)addr);
133 |     die_on(actual_pa != calc_pa,
134 |            "Physical addr %lX does not match calculated physical addr %lX",
135 |            actual_pa, calc_pa);
136 | #endif
137 | }
138 | 
139 | static void modify_entry_perms(uintptr_t addr, bool write, bool exec)
140 | {
141 |     DEBUG_PRINT("Modifying page table entries for address %lX write %d exec %d", addr, write, exec);
142 | 
143 |     size_t pml4_idx = ADDRMASK_PML4_INDEX(addr);
144 |     size_t pdpte_idx = ADDRMASK_PDPTE_INDEX(addr);
145 |     size_t pde_idx = ADDRMASK_PDE_INDEX(addr);
146 |     size_t pte_idx = ADDRMASK_PTE_INDEX(addr);
147 |     DEBUG_PRINT("PML4[%d] PDPTE[%d] PDE[%d] PTE[%d]", pml4_idx, pdpte_idx, pde_idx, pte_idx);
148 | 
149 |     pml4e_64 *pml4e = &m_ctx->pml4[pml4_idx];
150 |     die_on(!pml4e->present, "PML4E for addr %lX not present", addr);
151 | 
152 |     pdpte_64 *pdpt = (pdpte_64 *)((uintptr_t)pml4e->page_frame_number * PAGE_SIZE);
153 |     pdpte_64 *pdpte = &pdpt[pdpte_idx];
154 |     die_on(!pdpte->present, "PDPTE for addr %lX not present", addr);
155 | 
156 |     pde_64 *pd = (pde_64 *)((uintptr_t)pdpte->page_frame_number * PAGE_SIZE);
157 |     pde_64 *pde = &pd[pde_idx];
158 | 
159 |     die_on(!pde->present, "PDE for addr %lX not present", addr);
160 | 
161 |     pte_64 *pt = (pte_64 *)((uintptr_t)pde->page_frame_number * PAGE_SIZE);
162 |     pte_64 *pte = &pt[pte_idx];
163 |     die_on(!pte->present, "PTE for addr %lX not present", addr);
164 |     pte->write = write;
165 |     pte->execute_disable = !exec;
166 | 
167 |     /* Invalidate the TLB for address. */
168 |     __invlpg(&addr);
169 | }
170 | 
171 | void vmem_init(cr3 *original_cr3, cr3 *new_cr3)
172 | {
173 |     /* Store the original CR3 value before initialising the virtual-memory manager. */
174 |     original_cr3->flags = __readcr3();
175 |     DEBUG_PRINT("Storing original CR3 %lX", original_cr3->flags);
176 | 
177 |     /*
178 |      * Allocated a page for the vmem context.
179 |      * Unfortunately as we are the virtual memory manager we cannot
180 |      * allocate virtual memory to do this (for obvious reasons)
181 |      * so we will allocate via pmem (it will be identity mapped either way)
182 |      */
183 |     m_ctx = (struct vmem_ctx *)pmem_alloc_contiguous(sizeof(struct vmem_ctx));
184 |     die_on(!m_ctx, "Unable to allocate contact for virtual memory manager.");
185 | 
186 |     /* Clear main root PML4. */
187 |     memset(m_ctx->pml4, 0, sizeof(m_ctx->pml4));
188 | 
189 |     /* Initialise the page table with identity mapping for the environment. */
190 |     init_identity_table(m_ctx);
191 | 
192 |     /* Set the next free address in the dynamic allocator. */
193 |     m_ctx->next_free_addr = DYN_VMEM_START;
194 | 
195 |     spin_init(&m_ctx->sync);
196 | 
197 |     /* Write the new CR3 value so that the memory manager is used. */
198 |     new_cr3->page_level_cache_disable = original_cr3->page_level_cache_disable;
199 |     new_cr3->page_level_write_through = original_cr3->page_level_write_through;
200 |     new_cr3->address_of_page_directory = ((uintptr_t)m_ctx->pml4) / PAGE_SIZE;
201 |     __writecr3(new_cr3->flags);
202 |     DEBUG_PRINT("New CR3 value loaded %lX", new_cr3->flags);
203 | 
204 |     /* Ensure everything in EFER is correct. */
205 |     ia32_efer_register efer = { 0 };
206 |     efer.flags = rdmsr(IA32_EFER);
207 |     die_on(!efer.ia32e_mode_active, "ia32e_mode not active");
208 |     die_on(!efer.ia32e_mode_enable, "ia32e_mode not enabled");
209 |     if (!efer.execute_disable_bit_enable) {
210 |         DEBUG_PRINT("EFER.NX not enabled, setting now.");
211 |         efer.execute_disable_bit_enable = true;
212 |         wrmsr(IA32_EFER, efer.flags);
213 |     }
214 | }
215 | 
216 | void *vmem_alloc(size_t size, unsigned int flags)
217 | {
218 |     /* 
219 |      * For each of the pages for the address specified from our next
220 |      * free start address create a table entry.
221 |      */
222 |     spin_lock(&m_ctx->sync);
223 | 
224 |     DEBUG_PRINT("Unaligned size: %ld", size);
225 |     /* Align size to next largest page. */
226 |     size = (size & PAGE_MASK) ? ((size + PAGE_SIZE) & ~PAGE_MASK) : size;
227 |     DEBUG_PRINT("Attempting allocation of size: %ld", size);
228 | 
229 |     /* Determine info from flags. */
230 |     bool write = (flags & MEM_WRITE) != 0;
231 |     bool exec = (flags & MEM_EXECUTE) != 0;
232 | 
233 |     uintptr_t start_addr = m_ctx->next_free_addr;
234 |     uintptr_t end_addr = start_addr + size;
235 |     DEBUG_PRINT("Start addr 0x%lX end addr 0x%lX diff 0x%lX", start_addr, end_addr, end_addr - start_addr);
236 | 
237 |     for (uintptr_t curr_addr = start_addr;
238 |          curr_addr < end_addr;
239 |          curr_addr += PAGE_SIZE) {
240 |         
241 |         /* Allocate the pages tables for the address needed. */
242 |         create_table_entries(curr_addr, write, exec);
243 |     }
244 | 
245 |     m_ctx->next_free_addr = end_addr;
246 |     die_on(m_ctx->next_free_addr < DYN_VMEM_START,
247 |            "The virtual memory manager's next_free_addr has iterated back into the" \
248 |            "identity mapped area, we should probably create an algorithm to reuse" \
249 |            "freed memory ranges.");
250 | 
251 |     spin_unlock(&m_ctx->sync);
252 | 
253 |     return (void *)start_addr;
254 | }
255 | 
256 | void vmem_change_perms(void *addr, size_t size, unsigned int flags)
257 | {
258 |     spin_lock(&m_ctx->sync);
259 | 
260 |     /* Determine info from flags. */
261 |     bool write = (flags & MEM_WRITE) != 0;
262 |     bool exec = (flags & MEM_EXECUTE) != 0;
263 | 
264 |     uintptr_t start_addr = (uintptr_t)addr;
265 |     uintptr_t end_addr = start_addr + size;
266 | 
267 |     /* Iterate each page and change permissions. */
268 |     for (uintptr_t curr_addr = start_addr;
269 |          curr_addr < end_addr;
270 |          curr_addr += PAGE_SIZE) {
271 | 
272 |         modify_entry_perms(curr_addr, write, exec);
273 |     }
274 | 
275 |     spin_unlock(&m_ctx->sync);
276 | }
277 | 
278 | void vmem_free(void *addr, size_t size)
279 | {
280 |     /* TODO: Unfortunately need to pass size here, unless
281 |      * we keep a VAD style map logging all of our allocations
282 |      * (I'd rather kill myself than add more complexity to this
283 |      * considering this is not the main goal of the project). */
284 |     die_on(true, "vmem_free not implemented as of yet.");
285 |     (void)addr;
286 |     (void)size;
287 | }


--------------------------------------------------------------------------------
/hypervisor/vmm/nested.c:
--------------------------------------------------------------------------------
  1 | #define DEBUG_MODULE
  2 | #include "platform/standard.h"
  3 | #include "memory/mem.h"
  4 | #include "memory/vmem.h"
  5 | #include "vmm.h"
  6 | #include "handler.h"
  7 | #include "vmm_common.h"
  8 | #include "nested.h"
  9 | 
 10 | #define NESTED_REVISION_ID 0x0000BEEF
 11 | 
 12 | struct nested_ctx {
 13 |     uint32_t vm_instruction_error;
 14 |     gpa_t vmxon_ptr;
 15 | };
 16 | 
 17 | static void set_vmx_success_flags()
 18 | {
 19 |     rfl rflags;
 20 |     rflags.flags = __vmread(VMCS_GUEST_RFLAGS);
 21 | 
 22 |     rflags.carry_flag = 0;
 23 |     rflags.parity_flag = 0;
 24 |     rflags.auxiliary_carry_flag = 0;
 25 |     rflags.zero_flag = 0;
 26 |     rflags.sign_flag = 0;
 27 |     rflags.overflow_flag = 0;
 28 |     __vmwrite(VMCS_GUEST_RFLAGS, rflags.flags);
 29 | }
 30 | 
 31 | static void set_vmx_fail_invalid_flags()
 32 | {
 33 |     rfl rflags;
 34 |     rflags.flags = __vmread(VMCS_GUEST_RFLAGS);
 35 | 
 36 |     rflags.carry_flag = 1;
 37 |     rflags.parity_flag = 0;
 38 |     rflags.auxiliary_carry_flag = 0;
 39 |     rflags.zero_flag = 0;
 40 |     rflags.sign_flag = 0;
 41 |     rflags.overflow_flag = 0;
 42 |     __vmwrite(VMCS_GUEST_RFLAGS, rflags.flags);
 43 | }
 44 | 
 45 | // static void set_vmx_fail_valid_flags(struct nested_ctx *nested, uint32_t err_no)
 46 | // {
 47 | //     rfl rflags;
 48 | //     rflags.flags = __vmread(VMCS_GUEST_RFLAGS);
 49 | 
 50 | //     rflags.carry_flag = 0;
 51 | //     rflags.parity_flag = 0;
 52 | //     rflags.auxiliary_carry_flag = 0;
 53 | //     rflags.zero_flag = 1;
 54 | //     rflags.sign_flag = 0;
 55 | //     rflags.overflow_flag = 0;
 56 | //     __vmwrite(VMCS_GUEST_RFLAGS, rflags.flags);
 57 | //     nested->vm_instruction_error = err_no;
 58 | // }
 59 | 
 60 | static bool is_nested_enabled(struct vcpu_ctx *vcpu)
 61 | {
 62 |     return (vcpu->nested != NULL);
 63 | }
 64 | 
 65 | static void enable_nested_vmx(struct vcpu_ctx *vcpu)
 66 | {
 67 |     assert(!is_nested_enabled(vcpu));
 68 | 
 69 |     vcpu->nested = vmem_alloc(sizeof(struct nested_ctx), MEM_WRITE);
 70 |     die_on(!vcpu->nested, "Unable to allocate memory for vCPU %d nested virt.", vcpu->idx);
 71 | 
 72 |     DEBUG_PRINT("Nested virtualization enabled.");
 73 | }
 74 | 
 75 | static bool is_attempt_enable_vmx(struct vcpu_ctx *vcpu,
 76 |                                   vmx_exit_qualification_cr_access qual)
 77 | {
 78 |     if (qual.cr_number != VMX_EXIT_QUALIFICATION_REGISTER_CR4)
 79 |         return false;
 80 | 
 81 |     if (qual.access_type != VMX_EXIT_QUALIFICATION_ACCESS_MOV_TO_CR)
 82 |         return false;
 83 | 
 84 |     /* For RSP register access we have to directly read guest RSP. */
 85 |     uint64_t reg_val = vmm_read_gp_register(vcpu, qual.gp_register);
 86 |     DEBUG_PRINT("Attempt to set CR4 to val 0x%lX", reg_val);
 87 | 
 88 |     /* Check to see if guest is trying to set VMXE. */
 89 |     if (!(reg_val & CR4_VMXE_MASK))
 90 |         return false;
 91 | 
 92 |     /* If nested is already enabled do nothing. */
 93 |     if (is_nested_enabled(vcpu))
 94 |         return false;
 95 | 
 96 |     enable_nested_vmx(vcpu);
 97 |     return true;
 98 | }
 99 | 
100 | static bool get_vmx_mem_address(struct vcpu_ctx *vcpu,
101 |                                 uintptr_t qual,
102 |                                 vmx_vmexit_instruction_info_vmx_and_xsaves info,
103 |                                 gpa_t *mem_address)
104 | {
105 |     gpa_t offset = qual;
106 |     DEBUG_PRINT("Qual set to offset value 0x%lX", offset);
107 | 
108 |     if (info.base_register_invalid == false) {
109 |         /* Read the base register and add to offset. */
110 |         uint64_t base_reg = vmm_read_gp_register(vcpu, info.base_register);
111 |         offset += base_reg;
112 |         DEBUG_PRINT("Adding base register %d value 0x%lX new offset 0x%lX",
113 |                     info.base_register, base_reg, offset);
114 |     }
115 | 
116 |     if (info.gp_register_invalid == false) {
117 |         /* Read the index register, and add to offset with scale. */
118 |         uint64_t index_reg = vmm_read_gp_register(vcpu, info.gp_register);
119 |         offset += (index_reg << info.scaling);
120 |         DEBUG_PRINT("Adding index register 0x%lX with scaling 0x%X new offset 0x%lX",
121 |                     index_reg, info.scaling, offset);
122 |     }
123 | 
124 |     /* Deal with the address sizing. */
125 |     if (info.address_size == 1 /* 32 bit */)
126 |         offset &= 0xFFFFFFFF;
127 |     else if (info.address_size == 0 /* 16 bit */)
128 |         offset &= 0xFFFF;
129 | 
130 |     /*
131 |      * Check to see if guest in long mode,
132 |      * if not we throw a fault as we do not support anything else.
133 |      */
134 |     ia32_efer_register efer;
135 |     efer.flags = __vmread(VMCS_GUEST_EFER);
136 |     die_on(!efer.ia32e_mode_enable, "Nested virt not supported when not in long mode");
137 | 
138 |     /* If GS or FS set, add these to the offset. */
139 |     if (info.segment_register == SEG_FS)
140 |         offset += __vmread(VMCS_GUEST_FS_BASE);
141 |     else if (info.segment_register == SEG_GS)
142 |         offset += __vmread(VMCS_GUEST_GS_BASE);
143 | 
144 |     /* Ensure address falls in range of canonical addresses. */
145 |     if (is_noncanonical_address(offset)) {
146 |         DEBUG_PRINT("Non canonical address specified 0x%lX", offset);
147 |         static const exception_error_code DEFAULT_EC = { 0 };
148 |         vmm_inject_guest_event(info.segment_register == SEG_SS ?
149 |                                stack_segment_fault : general_protection,
150 |                                DEFAULT_EC);
151 |         return false;
152 |     } else {
153 |         *mem_address = offset;
154 |         return true;
155 |     }
156 | }
157 | 
158 | static bool get_vmptr(struct vcpu_ctx *vcpu, gpa_t *vmptr)
159 | {
160 |     /* Attempt to read the VMXON/VMCS region for the host. */
161 |     vmx_vmexit_instruction_info_vmx_and_xsaves info;
162 |     info.flags = __vmread(VMCS_EXIT_INSTR_INFO);
163 | 
164 |     uintptr_t qual = __vmread(VMCS_EXIT_QUALIFICATION);
165 |     DEBUG_PRINT("VMXON instruction info 0x%lX qual 0x%lX", info.flags, qual);
166 | 
167 |     /* Get the address of the VMXON pointer in guest virtual memory. */
168 |     uintptr_t guest_addr;
169 |     if (!get_vmx_mem_address(vcpu, qual, info, &guest_addr))
170 |         return false;
171 | 
172 |     cr3 guest_cr3;
173 |     guest_cr3.flags = __vmread(VMCS_GUEST_CR3);
174 |     if (!mem_copy_virt_tofrom_host(COPY_READ, guest_cr3, guest_addr, vmptr, sizeof(*vmptr))) {
175 |         DEBUG_PRINT("Unable to read guest memory for VMXON pointer");
176 |         return false;
177 |     }
178 | 
179 |     DEBUG_PRINT("VMX guest mem address 0x%lX vmptr 0x%lX", guest_addr, *vmptr);
180 |     return true;
181 | }
182 | 
183 | static bool nested_rdmsr(struct vcpu_ctx *vcpu, size_t msr, size_t *value)
184 | {
185 |     (void)vcpu;
186 | 
187 |     switch (msr) {
188 |     case IA32_VMX_BASIC:
189 |         ia32_vmx_basic_register basic = { 0 };
190 |         basic.vmcs_revision_id = NESTED_REVISION_ID;
191 |         basic.vmcs_size_in_bytes = PAGE_SIZE;
192 |         basic.memory_type = MEMORY_TYPE_WB;
193 |         basic.ins_outs_vmexit_information = true;
194 |         basic.true_controls = true;
195 |         *value = basic.flags;
196 |         break;
197 |     default:
198 |         return false;
199 |     }
200 | 
201 |     return true;
202 | }
203 | 
204 | static bool nested_wrmsr(struct vcpu_ctx *vcpu, size_t msr, size_t value)
205 | {
206 |     (void)vcpu;
207 |     (void)msr;
208 |     (void)value;
209 |     return false;
210 | }
211 | 
212 | static bool nested_mov_crx(struct vcpu_ctx *vcpu, bool *move_to_next)
213 | {
214 |     (void)vcpu;
215 | 
216 |     /* Verify to see whether the MOV CRX is relevant to nested. */
217 |     vmx_exit_qualification_cr_access qual;
218 |     qual.flags = __vmread(VMCS_EXIT_QUALIFICATION);
219 |     DEBUG_PRINT("Exit qualification 0x%lX", qual.flags);
220 | 
221 |     if (is_attempt_enable_vmx(vcpu, qual)) {
222 |         *move_to_next = true;
223 |         return true;
224 |     }
225 | 
226 |     *move_to_next = false;
227 |     return false;
228 | }
229 | 
230 | static bool nested_vmxon(struct vcpu_ctx *vcpu, bool *move_to_next)
231 | {
232 |     static const exception_error_code DEFAULT_EC = { 0 };
233 | 
234 |     /* Verify that VMXE is enabled for the guest. */
235 |     if (!(__vmread(VMCS_GUEST_CR4) & CR4_VMXE_MASK)) {
236 |         vmm_inject_guest_event(invalid_opcode, DEFAULT_EC);
237 |         *move_to_next = false;
238 |         return true;
239 |     }
240 | 
241 |     /* Verify that we are CPL 0. */
242 |     segment_selector cs;
243 |     cs.flags = __vmread(VMCS_GUEST_CS_SEL);
244 |     if (cs.request_privilege_level != 0) {
245 |         vmm_inject_guest_event(general_protection, DEFAULT_EC);
246 |         *move_to_next = false;
247 |         return true;
248 |     }
249 | 
250 |     /* Attempt to get the VMXON pointer. */
251 |     gpa_t vmptr;
252 |     if (!get_vmptr(vcpu, &vmptr)) {
253 |         *move_to_next = false;
254 |         return true;
255 |     }
256 | 
257 |     if (!vmptr) {
258 |         DEBUG_PRINT("Unlikely null vmptr region, indicating failure.");
259 |         set_vmx_fail_invalid_flags();
260 |         *move_to_next = true;
261 |         return true;
262 |     }
263 | 
264 |     /* Ensure that the guest VMXON pointer is page aligned. */
265 |     if (vmptr & PAGE_MASK) {
266 |         DEBUG_PRINT("Non page aligned vmxon region defined 0x%lX", vmptr);
267 |         set_vmx_fail_invalid_flags();
268 |         *move_to_next = true;
269 |         return true;
270 |     }
271 | 
272 |     /* Verify VMXON VMCS revision ID matches. */
273 |     uint32_t revision = *(uint32_t *)vmptr;
274 | 
275 |     /*
276 |      * vmptr is a physical address of the guest, as we're identity mapped this is
277 |      * mapped 1:1 into our virtual address space. No need to do conversion from
278 |      * virtual guest to physical host.
279 |      */
280 |     if (revision != NESTED_REVISION_ID) {
281 |         DEBUG_PRINT("Guest specified revision 0x%X does not match host supported 0x%X",
282 |                     revision, NESTED_REVISION_ID);
283 |         set_vmx_fail_invalid_flags();
284 |         *move_to_next = true;
285 |         return true;
286 |     } else {
287 |         DEBUG_PRINT("Guest vmcs revision 0x%X", revision);
288 |     }
289 | 
290 |     /* TODO: Allocate the vmxon fields for nested support. */
291 |     vcpu->nested->vmxon_ptr = vmptr;
292 |     set_vmx_success_flags();
293 | 
294 |     *move_to_next = true;
295 |     return true;
296 | }
297 | 
298 | static void handle_rdmsr(struct vcpu_ctx *vcpu, void *opaque, bool *move_to_next)
299 | {
300 |     static const exception_error_code DEFAULT_EC = { 0 };
301 | 
302 |     (void)opaque;
303 | 
304 |     size_t msr = (uint32_t)vcpu->guest_context.rcx;
305 | 
306 |     /* Check to see if valid RPL to perform the read. */
307 |     segment_selector cs;
308 |     cs.flags = __vmread(VMCS_GUEST_CS_SEL);
309 |     if (cs.request_privilege_level != 0) {
310 |         DEBUG_PRINT("RDMSR 0x%lX wrong RPL 0x%X", msr, cs.request_privilege_level);
311 |         vmm_inject_guest_event(general_protection, DEFAULT_EC);
312 |         *move_to_next = false;
313 |         return;
314 |     }
315 | 
316 |     /* Check to see if within valid MSR range. */
317 |     if ((msr && (msr <= 0x1FFF)) || ((msr >= 0xC0000000) && (msr <= 0xC0001FFF))) {
318 | 
319 |         /* Attempt to offload to nested handler. */
320 |         size_t value;
321 |         if (!nested_rdmsr(vcpu, msr, &value))
322 |             die_on(true, "Unhandled MSR read 0x%lX at rip 0x%lX", msr, vcpu->guest_context.rip);
323 | 
324 |         vcpu->guest_context.rax = (uint32_t)value;
325 |         vcpu->guest_context.rdx = value >> 32;
326 |         DEBUG_PRINT("Guest read MSR 0x%lX spoofed value 0x%lX real 0x%lX",
327 |                     msr, value, rdmsr(msr));
328 | 
329 |         *move_to_next = true;
330 |         return;
331 |     }
332 | 
333 |     /* Invalid MSR which is out of range. */
334 |     DEBUG_PRINT("RDMSR 0x%lX out of range", msr);
335 |     vmm_inject_guest_event(general_protection, DEFAULT_EC);
336 |     *move_to_next = false;
337 | }
338 | 
339 | static void handle_wrmsr(struct vcpu_ctx *vcpu, void *opaque, bool *move_to_next)
340 | {
341 |     static const exception_error_code DEFAULT_EC = { 0 };
342 | 
343 |     (void)opaque;
344 | 
345 |     size_t msr_id = (uint32_t)vcpu->guest_context.rcx;
346 |     size_t msr_val = (vcpu->guest_context.rdx << 32) | (uint32_t)vcpu->guest_context.rax;
347 | 
348 |     /* Check to see if valid RPL to perform the read. */
349 |     segment_selector cs;
350 |     cs.flags = __vmread(VMCS_GUEST_CS_SEL);
351 |     if (cs.request_privilege_level != 0) {
352 |         DEBUG_PRINT("WRMSR 0x%lX 0x%lX wrong RPL 0x%X", msr_id, msr_val, cs.request_privilege_level);
353 |         vmm_inject_guest_event(general_protection, DEFAULT_EC);
354 |         *move_to_next = false;
355 |         return;
356 |     }
357 | 
358 |     /* Check to see if within valid MSR range. */
359 |     if ((msr_id && (msr_id <= 0x1FFF)) || ((msr_id >= 0xC0000000) && (msr_id <= 0xC0001FFF))) {
360 | 
361 |         /* Attempt to offload to nested handler. */
362 |         if (!nested_wrmsr(vcpu, msr_id, msr_val))
363 |             die_on(true, "Unhandled MSR write 0x%lX value at rip 0x%lX",
364 |                    msr_id, msr_val, vcpu->guest_context.rip);
365 | 
366 |         DEBUG_PRINT("Guest write MSR 0x%lX value 0x%lX", msr_id, msr_val);
367 | 
368 |         *move_to_next = true;
369 |         return;
370 |     }
371 | 
372 |     /* Invalid MSR which is out of range. */
373 |     DEBUG_PRINT("WRMSR 0x%lX 0x%lX out of range", msr_id, msr_val);
374 |     vmm_inject_guest_event(general_protection, DEFAULT_EC);
375 |     *move_to_next = false;
376 |     return;
377 | }
378 | 
379 | static void handle_mov_crx(struct vcpu_ctx *vcpu, void *opaque, bool *move_to_next)
380 | {
381 |     (void)opaque;
382 |     die_on(!nested_mov_crx(vcpu, move_to_next), "Unable to handle MOV_CRX");
383 | }
384 | 
385 | static void handle_vmxon(struct vcpu_ctx *vcpu, void *opaque, bool *move_to_next)
386 | {
387 |     (void)opaque;
388 |     die_on(!nested_vmxon(vcpu, move_to_next), "Unable to handle VMXON");
389 | }
390 | 
391 | void nested_init(struct vmm_ctx *vmm)
392 | {
393 |     handler_register_exit(vmm->handler, VMX_EXIT_REASON_RDMSR, handle_rdmsr, NULL, false);
394 |     handler_register_exit(vmm->handler, VMX_EXIT_REASON_WRMSR, handle_wrmsr, NULL, false);
395 |     handler_register_exit(vmm->handler, VMX_EXIT_REASON_MOV_CRX, handle_mov_crx, NULL, false);
396 |     handler_register_exit(vmm->handler, VMX_EXIT_REASON_VMXON, handle_vmxon, NULL, false);
397 | }
398 | 
399 | void nested_init_vcpu(struct vcpu_ctx *vcpu)
400 | {
401 |     /*
402 |      * Adjust the MSR bitmap to indicate which nested VMX related
403 |      * MSRs we need to trap on.
404 |      */
405 |     vmm_msr_trap_enable(vcpu->msr_trap_bitmap, IA32_VMX_BASIC, true);
406 | }


--------------------------------------------------------------------------------
/hypervisor/vmm/ept.c:
--------------------------------------------------------------------------------
  1 | #define DEBUG_MODULE
  2 | #include <errno.h>
  3 | #include "platform/standard.h"
  4 | #include "platform/intrin.h"
  5 | #include "memory/vmem.h"
  6 | #include "memory/mem.h"
  7 | #include "ept.h"
  8 | #include "ia32_compact.h"
  9 | 
 10 | #define ENTRIES_PER_TABLE 512
 11 | 
 12 | struct mtrr_data {
 13 |     bool valid;
 14 |     bool is_fixed;
 15 |     uint8_t type;
 16 |     size_t phys_base_min;
 17 |     size_t phys_base_max;
 18 | };
 19 | 
 20 | struct ept_ctx {
 21 |     /* Describes 512 contiguous 512GiB memory regions. */
 22 |     __attribute__((aligned(PAGE_SIZE))) epml4e pml4[ENTRIES_PER_TABLE];
 23 |     /* Describes exactly 512 contiguous 1GiB memory regions with a singular PML4 region. */
 24 |     __attribute__((aligned(PAGE_SIZE))) ept_pdpte pml3[ENTRIES_PER_TABLE];
 25 |     /* For each 1GB PML3 entry, create 512 2MB regions.
 26 |      * We are using 2MB pages as the smallest paging size in the map so that we do not need
 27 |      * to allocate individual 4096 PML1 paging structures. */
 28 |     __attribute__((aligned(PAGE_SIZE))) ept_pde_2mb pml2[ENTRIES_PER_TABLE][ENTRIES_PER_TABLE];
 29 | 
 30 |     /* The EPT pointer for this context. */
 31 |     eptp ept_ptr;
 32 | 
 33 |     /* List of MTRR data, not all may be valid as each processor arch
 34 |      * can vary depending on the amount of MTRRs implemented. */
 35 |     struct mtrr_data mtrr[IA32_MTRR_COUNT];
 36 |     size_t mtrr_count;
 37 |     uint8_t def_mem_type;
 38 | };
 39 | 
 40 | struct ept_split_page {
 41 |     /* The PML1E/EPT_PTE table. */
 42 |     __attribute__((aligned(PAGE_SIZE))) ept_pte pml1[ENTRIES_PER_TABLE];
 43 | 
 44 |     /* A back reference to the PML2 entry which this was created for. */
 45 |     ept_pde_2mb *pml2e_ref;
 46 | };
 47 | 
 48 | static void gather_fixed_mtrr(struct ept_ctx *ctx)
 49 | {
 50 |     struct fixed_mtrr_info {
 51 |         uint64_t msr_id;
 52 |         uintptr_t base_address;
 53 |         size_t managed_size;
 54 |     };
 55 | 
 56 |     typedef union {
 57 |         struct {
 58 |             uint8_t types[8];
 59 |         } u;
 60 |         uint64_t flags;
 61 |     } ia32_mtrr_fixed_range_msr;
 62 | 
 63 |     static const struct fixed_mtrr_info FIXED_INFO[] = {
 64 |         { IA32_MTRR_FIX64K_00000, 0x0, 0x10000, },
 65 |         { IA32_MTRR_FIX16K_80000, 0x80000, 0x4000, },
 66 |         { IA32_MTRR_FIX16K_A0000, 0xA0000, 0x4000, },
 67 |         { IA32_MTRR_FIX4K_C0000,  0xC0000, 0x1000, },
 68 |         { IA32_MTRR_FIX4K_C8000,  0xC8000, 0x1000, },
 69 |         { IA32_MTRR_FIX4K_D0000,  0xD0000, 0x1000, },
 70 |         { IA32_MTRR_FIX4K_D8000,  0xD8000, 0x1000, },
 71 |         { IA32_MTRR_FIX4K_E0000,  0xE0000, 0x1000, },
 72 |         { IA32_MTRR_FIX4K_E8000,  0xE8000, 0x1000, },
 73 |         { IA32_MTRR_FIX4K_F0000,  0xF0000, 0x1000, },
 74 |         { IA32_MTRR_FIX4K_F8000,  0xF8000, 0x1000, },
 75 |     };
 76 | 
 77 |     ia32_mtrrcap_register caps = { .flags = rdmsr(IA32_MTRRCAP) };
 78 |     ia32_mtrr_def_type_register def_type = { .flags = rdmsr(IA32_MTRR_DEF_TYPE) };
 79 | 
 80 |     /* Store the default memory type, for all regions not covered by a MTRR. */
 81 |     ctx->def_mem_type = def_type.default_memory_type;
 82 | 
 83 |     if (!caps.fixed_range_registers_supported || !def_type.fixed_range_mtrr_enable)
 84 |         return;
 85 | 
 86 |     struct mtrr_data *last_valid = NULL;
 87 | 
 88 |     for (size_t i = 0; i < ARRAY_SIZE(FIXED_INFO); i++) {
 89 |         const struct fixed_mtrr_info *curr_range = &FIXED_INFO[i];
 90 | 
 91 |         ia32_mtrr_fixed_range_msr fixed_range = {.flags = rdmsr(curr_range->msr_id) };
 92 | 
 93 |         for (size_t j = 0; j < ARRAY_SIZE(fixed_range.u.types); j++) {
 94 |             uint8_t mem_type = fixed_range.u.types[j];
 95 |             uint64_t range_begin = curr_range->base_address + (curr_range->managed_size * j);
 96 |             uint64_t range_end = range_begin + curr_range->managed_size;
 97 | 
 98 |             /*
 99 |              * Check to see if we can combine it with previous.
100 |              * For this to be true, it must be of the same memory type
101 |              * and also be contiguous.
102 |              *
103 |              * This will make it easier & quicker for when we do searching.
104 |              */
105 |             if (last_valid && (last_valid->type == mem_type) &&
106 |                 (last_valid->phys_base_max == range_begin)) {
107 | 
108 |                 last_valid->phys_base_max += curr_range->managed_size;
109 |                 // DEBUG_PRINT("Extended last fixed entry to phys_based_min=0x%lX phys_base_max=0x%lX",
110 |                 //             last_valid->phys_base_min, last_valid->phys_base_max);
111 |             } else {
112 |                 struct mtrr_data *new_entry = &ctx->mtrr[ctx->mtrr_count];
113 |                 new_entry->valid = true;
114 |                 new_entry->is_fixed = true;
115 |                 new_entry->phys_base_min = range_begin;
116 |                 new_entry->phys_base_max = range_end;
117 |                 new_entry->type = mem_type;
118 | 
119 |                 // DEBUG_PRINT("Adding fixed entry phys_base_min=0x%lX phys_base_max=0x%lX type=0x%lX",
120 |                 //             new_entry->phys_base_min, new_entry->phys_base_max, new_entry->type);
121 | 
122 |                 last_valid = new_entry;
123 |                 ctx->mtrr_count++;
124 |             }
125 |         }
126 |     }
127 | }
128 | 
129 | static void gather_variable_mtrr(struct ept_ctx *ctx)
130 | {
131 |     ia32_mtrrcap_register caps = { .flags = rdmsr(IA32_MTRRCAP) };
132 |     ia32_mtrr_physbase_register base;
133 |     ia32_mtrr_physmask_register mask;
134 | 
135 | 
136 |     struct mtrr_data *last_valid = NULL;
137 | 
138 |     for (size_t i = 0; i < caps.variable_range_registers_count; i++) {
139 |         base.flags = rdmsr(IA32_MTRR_PHYSBASE0 + (i * 2));
140 |         mask.flags = rdmsr(IA32_MTRR_PHYSMASK0 + (i * 2));
141 | 
142 |         /* If the mtrr is valid, calculate the min and maximum ranges. */
143 |         if (mask.valid) {
144 | 
145 |             /*
146 |              * __builtin_ffsll returns 1 + the index of the least significate 1-bit of x.
147 |              * https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html
148 |              */
149 |             long long bit_idx = __builtin_ffsll(mask.physical_addres_mask) - 1;
150 |             size_t size_in_pages = (1ull << bit_idx);
151 | 
152 |             uint8_t mem_type = base.type;
153 |             uint64_t range_begin = base.physical_addres_base * PAGE_SIZE;
154 |             uint64_t range_size = size_in_pages * PAGE_SIZE;
155 |             uint64_t range_end = range_begin + range_size;
156 | 
157 |             if (last_valid && (last_valid->type == mem_type) &&
158 |                 (last_valid->phys_base_max == range_begin)) {
159 | 
160 |                 last_valid->phys_base_max += range_size;
161 |                 // DEBUG_PRINT("Extended last variable entry to phys_base_min=0x%lX phys_base_max=0x%lX",
162 |                 //             last_valid->phys_base_min, last_valid->phys_base_max);
163 |             } else {
164 |                 struct mtrr_data *new_entry = &ctx->mtrr[ctx->mtrr_count];
165 |                 new_entry->valid = true;
166 |                 new_entry->is_fixed = false;
167 |                 new_entry->phys_base_min = range_begin;
168 |                 new_entry->phys_base_max = range_end;
169 |                 new_entry->type = mem_type;
170 | 
171 |                 // DEBUG_PRINT("Adding variable phys_base_min=0x%lX phys_base_max=0x%lX type=0x%lX",
172 |                 //             new_entry->phys_base_min, new_entry->phys_base_max, new_entry->type);
173 | 
174 |                 last_valid = new_entry;
175 |                 ctx->mtrr_count++;
176 |             }
177 |         }
178 |     }
179 | }
180 | 
181 | static void gather_mtrr_list(struct ept_ctx *ctx)
182 | {
183 |     gather_fixed_mtrr(ctx);
184 |     gather_variable_mtrr(ctx);
185 | 
186 |     /* Let's print our memory type information. */
187 |     DEBUG_PRINT("Default memory type=%d", ctx->def_mem_type);
188 |     for (size_t i = 0; i < IA32_MTRR_COUNT; i++) {
189 |         const struct mtrr_data *curr_mtrr = &ctx->mtrr[i];
190 | 
191 |         if (curr_mtrr->valid)
192 |             DEBUG_PRINT("Range begin=0x%016llX end=0x%016llX type=%d fixed=%d",
193 |                         curr_mtrr->phys_base_min, curr_mtrr->phys_base_max,
194 |                         curr_mtrr->type, curr_mtrr->is_fixed);
195 |     }
196 | }
197 | 
198 | static uint32_t calc_mem_type(struct ept_ctx *ctx, uintptr_t phys_begin, size_t phys_size)
199 | {
200 |     /*
201 |      * Iterate all of the MTRRs we have defined, and check to see if they match any of
202 |      * the range which we have specified, if so we then use that MTRR's value.
203 |      */
204 |     uint32_t mem_type = MEMORY_TYPE_INVALID;
205 | 
206 |     for (size_t i = 0; i < IA32_MTRR_COUNT; i++) {
207 |         const struct mtrr_data *curr_mtrr = &ctx->mtrr[i];
208 | 
209 |         /* Filter out invalid/empty entries. */
210 |         if (!curr_mtrr->valid)
211 |             continue;
212 | 
213 |         /* If out of range, let's skip. */
214 |         if ((phys_begin < curr_mtrr->phys_base_min) ||
215 |             (phys_begin >= curr_mtrr->phys_base_max))
216 |             continue;
217 | 
218 |         /* If the range of our phys region is larger than defined in the MTRR throw an error. */
219 |         if ((phys_begin + phys_size - 1) >= curr_mtrr->phys_base_max)
220 |             return mem_type;
221 | 
222 |         /* Fixed MTRRs take precedence over all others. */
223 |         if (curr_mtrr->is_fixed)
224 |             return curr_mtrr->type;
225 | 
226 |         /* Uncacheable takes next precedence. */
227 |         if (curr_mtrr->type == MEMORY_TYPE_UC)
228 |             return curr_mtrr->type;
229 | 
230 |         /* Writethrough always takes precedence over writeback memory. */
231 |         if (((curr_mtrr->type == MEMORY_TYPE_WT) && (mem_type == MEMORY_TYPE_WB)) ||
232 |             ((curr_mtrr->type == MEMORY_TYPE_WB) && (mem_type == MEMORY_TYPE_WT))) {
233 |             mem_type = MEMORY_TYPE_WT;
234 |         }
235 | 
236 |         /* Anything else, just set to last matched. */
237 |         mem_type = curr_mtrr->type;
238 |     }
239 | 
240 |     /* If we didn't find the value in the MTRR list then just use the default. */
241 |     if (mem_type == MEMORY_TYPE_INVALID)
242 |         mem_type = ctx->def_mem_type;
243 | 
244 |     return mem_type;
245 | }
246 | 
247 | static void ept_split_large_page(struct ept_ctx *ctx, uintptr_t phys_addr)
248 | {
249 |     /* Attempt to get the PML2E for the physical address specified. */
250 |     ept_pde_2mb *target_pml2e = get_ept_pml2e(ctx, phys_addr);
251 |     die_on(!target_pml2e, "Invalid PML2E for addr 0x%lX", phys_addr);
252 |     
253 |     /* If the large page bit isn't set, this means already split. */
254 |     if (!target_pml2e->large_page)
255 |         return;
256 | 
257 |     struct ept_split_page *new_split = vmem_alloc(sizeof(struct ept_split_page), MEM_WRITE);
258 |     die_on(!new_split, "Unable to allocate memory for split page, phys_addr 0x%lX", phys_addr);
259 | 
260 |     /* Store the back reference to the PML2E */
261 |     new_split->pml2e_ref = target_pml2e;
262 | 
263 |     /* Now create a stub/template PML1E with default params. */
264 |     ept_pte temp_pte = {
265 |         .read_access = true,
266 |         .write_access = true,
267 |         .execute_access = true,
268 |         .ignore_pat = target_pml2e->ignore_pat,
269 |         .suppress_ve = target_pml2e->suppress_ve
270 |     };
271 | 
272 |     die_on(temp_pte.memory_type == MEMORY_TYPE_INVALID,
273 |            "Memory type for 0x%lX is invalid even with splitting.", phys_addr);
274 | 
275 |     /* Calculate the physical address of the original PML2 entry.
276 |      * and the page frame number, we will use this as base. */
277 |     uintptr_t base_pml2e = target_pml2e->page_frame_number * MiB(2);
278 |     uintptr_t base_pfn = base_pml2e / PAGE_SIZE;
279 | 
280 |     /* Now fill out all the new PML1E's for the table.
281 |      * Use the template PTE for the general flags,
282 |      * but we will also need to update the PFN.
283 |      * As we have calculated the original PFN for the PML2E we can
284 |      * we can just add one page for each entry. */
285 |     for (int i = 0; i < ENTRIES_PER_TABLE; i++) {
286 |         size_t curr_pfn = base_pfn + i;
287 |         uintptr_t curr_phys = curr_pfn * PAGE_SIZE;
288 | 
289 |         new_split->pml1[i] = temp_pte;
290 |         new_split->pml1[i].memory_type = calc_mem_type(ctx, curr_phys, PAGE_SIZE);
291 |         new_split->pml1[i].page_frame_number = curr_pfn;
292 |     }
293 | 
294 |     /* Now create a new PML2E entry to replace the old one. */
295 |     cr3 this_cr3;
296 |     this_cr3.flags = __readcr3();
297 | 
298 |     uintptr_t phys_pml1e = mem_va_to_pa(this_cr3, &new_split->pml1[0]);
299 | 
300 |     ept_pde new_pde = {
301 |         .read_access = true,
302 |         .write_access = true,
303 |         .execute_access = true,
304 |         .page_frame_number = (phys_pml1e / PAGE_SIZE)
305 |     };
306 |     target_pml2e->flags = new_pde.flags;
307 | }
308 | 
309 | struct ept_ctx *ept_init(void)
310 | {
311 |     /* Initialise SLAT/EPT for the guest.
312 |      * As we are hyperjacking this will consist of a
313 |      * 1:1 guest/host identity map to aid in conversion. */
314 |     struct ept_ctx *ctx = vmem_alloc(sizeof(struct ept_ctx), MEM_WRITE);
315 |     die_on(!ctx, "Unable to allocate context for EPT.");
316 | 
317 |     /* Gather the MTRR layout list so that this can be used in the future. */
318 |     gather_mtrr_list(ctx);
319 | 
320 |     /* Configure the EPT pointer. */
321 |     cr3 this_cr3;
322 |     this_cr3.flags = __readcr3();
323 | 
324 |     uintptr_t phys_pml4 = mem_va_to_pa(this_cr3, ctx->pml4);
325 |     ctx->ept_ptr.page_walk_length = 3;
326 |     ctx->ept_ptr.memory_type = MEMORY_TYPE_WB;
327 |     ctx->ept_ptr.page_frame_number = phys_pml4 / PAGE_SIZE;
328 | 
329 |     /* Fill out the first top level 512GiB entry.
330 |      * We don't need to do the others as it's HIGHLY unlikely
331 |      * that this will ever be ran on a 512GiB system. */
332 |     uintptr_t phys_pml3 = mem_va_to_pa(this_cr3, ctx->pml3);
333 |     ctx->pml4[0].read_access = true;
334 |     ctx->pml4[0].write_access = true;
335 |     ctx->pml4[0].execute_access = true;
336 |     ctx->pml4[0].page_frame_number = phys_pml3 / PAGE_SIZE;
337 | 
338 |     /* Configure the lower level PML3 table,
339 |      * each entry indicates 1GiB of physical memory,
340 |      * therefore the first 512GiB is identity mapped. */
341 |     for (int i = 0; i < ENTRIES_PER_TABLE; i++) {
342 |         ctx->pml3[i].read_access = true;
343 |         ctx->pml3[i].write_access = true;
344 |         ctx->pml3[i].execute_access = true;
345 | 
346 |         uintptr_t phys_pml2 = mem_va_to_pa(this_cr3, &ctx->pml2[i][0]);
347 |         ctx->pml3[i].page_frame_number = phys_pml2 / PAGE_SIZE;
348 |     }
349 | 
350 |     /* Loop every 1 GiB of RAM (PML3). */
351 |     for (int i = 0; i < ENTRIES_PER_TABLE; i++) {
352 |         /* Loop every 2 MiB within that GiB. */
353 |         for (int j = 0; j < ENTRIES_PER_TABLE; j++) {
354 |             ctx->pml2[i][j].read_access = true;
355 |             ctx->pml2[i][j].write_access = true;
356 |             ctx->pml2[i][j].execute_access = true;
357 |             ctx->pml2[i][j].large_page = true;
358 |             ctx->pml2[i][j].page_frame_number = (i * ENTRIES_PER_TABLE) + j;
359 | 
360 |             uintptr_t phys_addr = ctx->pml2[i][j].page_frame_number * MiB(2);
361 | 
362 |             /* Calculate the memory type for this entry. */
363 |             uint32_t mem_type = calc_mem_type(ctx, phys_addr, MiB(2));
364 | 
365 |             /*
366 |              * If the memory type is invalid (too large but semi in a 2MiB page)
367 |              * Then let's split the it into smaller PML1E entries and then re-calc
368 |              * the memory type individually.
369 |              */
370 |             if (mem_type != MEMORY_TYPE_INVALID) {
371 |                 ctx->pml2[i][j].memory_type = mem_type;
372 |             } else {
373 |                 DEBUG_PRINT("Memory type for 0x%lX is invalid, splitting.", phys_addr);
374 |                 ept_split_large_page(ctx, phys_addr);
375 |             }
376 |         }
377 |     }
378 | 
379 |     return ctx;
380 | }
381 | 
382 | eptp *ept_get_pointer(struct ept_ctx *ctx)
383 | {
384 |     return &ctx->ept_ptr;
385 | }
386 | 
387 | ept_pde_2mb *get_ept_pml2e(struct ept_ctx *ctx, uintptr_t phys_addr)
388 | {
389 |     uint64_t pml4_idx = ADDRMASK_EPT_PML4_INDEX(phys_addr);
390 |     die_on(pml4_idx, "Cannot support PML4E[%d] above 0 (512GiB)", pml4_idx);
391 | 
392 |     uint64_t pml3_idx = ADDRMASK_EPT_PML3_INDEX(phys_addr);
393 |     uint64_t pml2_idx = ADDRMASK_EPT_PML2_INDEX(phys_addr);
394 |     return &ctx->pml2[pml3_idx][pml2_idx];
395 | }
396 | 
397 | ept_pte *ept_get_pml1e(struct ept_ctx *ctx, uintptr_t phys_addr)
398 | {
399 |     /* First get the PML2E.
400 |      * If the current page is a large page (2MiB) then we
401 |      * should proceed with splitting this PML2E into standard
402 |      * pages. From there we can then return that value. */
403 |     ept_pde_2mb *target_pml2e_2mb = get_ept_pml2e(ctx, phys_addr);
404 |     die_on(!target_pml2e_2mb, "Invalid PML2E for addr 0x%lX", phys_addr);
405 | 
406 |     if (target_pml2e_2mb->large_page) {
407 |         /* Split the page, and then ensure we invalidate and flush the
408 |          * EPT cache. */
409 |         ept_split_large_page(ctx, phys_addr);
410 |         ept_invalidate_and_flush(ctx);
411 |     }
412 | 
413 |     /* Now re-cast the PML2E/PDE as it should be split,
414 |      * then we can just return the correct value. */
415 |     ept_pde *target_pml2e = (ept_pde *)target_pml2e_2mb;
416 | 
417 |     ept_pte *pml1_table = (ept_pte *)((uintptr_t)target_pml2e->page_frame_number * PAGE_SIZE);
418 |     return &pml1_table[ADDRMASK_EPT_PML1_INDEX(phys_addr)];
419 | }
420 | 
421 | void ept_invalidate_and_flush(struct ept_ctx *ctx)
422 | {
423 |     invept_descriptor desc = {
424 |         .ept_pointer = ctx->ept_ptr.flags,
425 |         .reserved = 0
426 |     };
427 |     __invept(invvpid_all_context, &desc);
428 | }


--------------------------------------------------------------------------------
/hypervisor/vmm/handler.c:
--------------------------------------------------------------------------------
  1 | #define DEBUG_MODULE
  2 | #include "platform/intrin.h"
  3 | #include "memory/mem.h"
  4 | #include "memory/vmem.h"
  5 | #include "interrupt/idt.h"
  6 | #include "vmm_common.h"
  7 | #include "handler.h"
  8 | #include "ia32_compact.h"
  9 | 
 10 | struct vmexit_handler {
 11 |     /* Doubly linked list so multiple exit handlers can be daisy chained. */
 12 |     struct vmexit_handler *next, *prev;
 13 |     /* The callback for the exit to be called. */
 14 |     vmexit_cbk_t callback;
 15 |     /* Callback specific data. */
 16 |     void *opaque;
 17 |     /* If called, prevent other daisy chained callbacks from being called. */
 18 |     bool override;
 19 | };
 20 | 
 21 | struct handler_ctx {
 22 |     /* An array of exit handler structures.
 23 |      * Each VMEXIT can have multiple handlers daisy chained.
 24 |      * therefore we have to keep track of pointers.
 25 |      *
 26 |      * We can store up to the maximum exit reason which is XRSTORS
 27 |      * (well at least currently). */
 28 |     #define MAX_EXIT_HANDLERS VMX_EXIT_REASON_XRSTORS
 29 |     struct vmexit_handler *handlers[MAX_EXIT_HANDLERS];
 30 |     /* Back reference to the VMM context */
 31 |     struct vmm_ctx *vmm;
 32 | };
 33 | 
 34 | static void handle_cached_interrupts(struct vcpu_ctx *vcpu)
 35 | {
 36 |     /*
 37 |      * Check to see if there are any pending interrupts
 38 |      * to be delivered that were caught from the host IDT
 39 |      * that need to be redirected to the guest.
 40 |      * 
 41 |      * We only do this if there is NOT already a pending
 42 |      * interrupt.
 43 |      */
 44 |     if (vcpu->cached_int.pending) {
 45 |         DEBUG_PRINT("Forwarding vector 0x%lX error code 0x%lX",
 46 |                       vcpu->cached_int.vector, vcpu->cached_int.code);
 47 |         vmm_inject_guest_event(vcpu->cached_int.vector, vcpu->cached_int.code);
 48 |         vcpu->cached_int.pending = false;
 49 |     }
 50 | }
 51 | 
 52 | static void handle_cpuid(struct vcpu_ctx *vcpu, void *opaque, bool *move_to_next)
 53 | {
 54 |     /* Override leafs. */
 55 |     #define HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS 0x40000000
 56 |     #define HYPERV_CPUID_INTERFACE 0x40000001
 57 | 
 58 |     /* Bitmasks in certain leafs. */
 59 |     static const size_t CPUID_VI_BIT_HYPERVISOR_PRESENT = 0x80000000;
 60 | 
 61 |     (void)opaque;
 62 | 
 63 |     /* Read the CPUID into the leafs array. */
 64 |     uint64_t leaf = vcpu->guest_context.rax;
 65 |     uint64_t sub_leaf = vcpu->guest_context.rcx;
 66 |     int out_regs[4] = { 0 };
 67 |     __cpuidex(out_regs, leaf, sub_leaf);
 68 | 
 69 |     /* Override certain target leafs. */
 70 | #pragma GCC diagnostic push
 71 | #pragma GCC diagnostic ignored "-Wmultichar"
 72 |     switch (leaf) {
 73 |     case CPUID_VERSION_INFO:
 74 |         out_regs[2] &= ~(uint64_t)CPUID_VI_BIT_HYPERVISOR_PRESENT;
 75 |         break;
 76 |     case HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS:
 77 |         out_regs[0] = HYPERV_CPUID_INTERFACE;
 78 |         out_regs[1] = 'csac';
 79 |         out_regs[2] = '\0eda';
 80 |         out_regs[3] = '\0\0\0\0';
 81 |         break;
 82 |     case HYPERV_CPUID_INTERFACE:
 83 |         out_regs[0] = 'csac';
 84 |         out_regs[1] = 0;
 85 |         out_regs[2] = 0;
 86 |         out_regs[3] = 0;
 87 |         break;
 88 |     }
 89 | #pragma GCC diagnostic pop
 90 | 
 91 |     DEBUG_PRINT("CPUID leaf 0x%lX sub_leaf 0x%lX - 0x%lX 0x%lX 0x%lX 0x%lX",
 92 |                   leaf, sub_leaf, out_regs[0], out_regs[1], out_regs[2], out_regs[3]);
 93 | 
 94 |     /* Store these leafs back into the guest context and move to next. */
 95 |     vcpu->guest_context.rax = out_regs[0];
 96 |     vcpu->guest_context.rbx = out_regs[1];
 97 |     vcpu->guest_context.rcx = out_regs[2];
 98 |     vcpu->guest_context.rdx = out_regs[3];
 99 |     *move_to_next = true;
100 | }
101 | 
102 | static void handle_xsetbv(struct vcpu_ctx *vcpu, void *opaque, bool *move_to_next)
103 | {
104 |     static const exception_error_code DEFAULT_EC = { 0 };
105 | 
106 |     (void)opaque;
107 | 
108 |     /* Check to ensure that os_xsave is enabled. */
109 |     cr4 guest_cr4;
110 |     guest_cr4.flags = __vmread(VMCS_GUEST_CR4);
111 |     if (!guest_cr4.os_xsave) {
112 |         DEBUG_PRINT("XSETBV when CR4.os_xsave not set");
113 |         vmm_inject_guest_event(invalid_opcode, DEFAULT_EC);
114 |         *move_to_next = false;
115 |         return;
116 |     }
117 | 
118 |     /* Check that a valid XCR index is set (only 0 supported). */
119 |     uint32_t field = (uint32_t)vcpu->guest_context.rcx;
120 |     if (field) {
121 |         DEBUG_PRINT("XSETBV invalid XCR field 0x%X", field);
122 |         vmm_inject_guest_event(general_protection, DEFAULT_EC);
123 |         *move_to_next = false;
124 |         return;
125 |     }
126 | 
127 |     /*
128 |      * Running XSETBV requires os_xsave to be set in CR4
129 |      * this is not the cast in an EFI booted environment
130 |      * so we enable it before the call.
131 |      */
132 |     cr4 host_cr4;
133 |     host_cr4.flags = __readcr4();
134 |     host_cr4.os_xsave = true;
135 |     __writecr4(host_cr4.flags);
136 | 
137 |     uint64_t value = (vcpu->guest_context.rdx << 32) | (uint32_t)vcpu->guest_context.rax;
138 | 
139 |     DEBUG_PRINT("XSETBV field 0x%lX value 0x%lX", field, value);
140 |     __xsetbv(field, value);
141 |     
142 |     *move_to_next = true;
143 | }
144 | 
145 | static void handle_invd(struct vcpu_ctx *vcpu, void *opaque, bool *move_to_next)
146 | {
147 |     (void)vcpu;
148 |     (void)opaque;
149 | 
150 |     DEBUG_PRINT("INVD");
151 |     __invd();
152 |     *move_to_next = true;
153 | }
154 | 
155 | static void handle_init_signal(struct vcpu_ctx *vcpu, void *opaque, bool *move_to_next)
156 | {
157 |     (void)opaque;
158 | 
159 |     /* Control registers. */
160 |     cr0 guest_cr0 = {
161 |         .extension_type = true,
162 |         .numeric_error = true,
163 |         .not_write_through = true,
164 |         .cache_disable = true
165 |     };
166 | 
167 |     __vmwrite(VMCS_GUEST_CR0, guest_cr0.flags);
168 | 
169 |     __vmwrite(VMCS_GUEST_CR3, 0);
170 | 
171 |     cr4 guest_cr4 = { 
172 |         .vmx_enable = true
173 |      };
174 |     __vmwrite(VMCS_GUEST_CR4, guest_cr4.flags);
175 | 
176 |     /* Configure the GDT entries & access rights. */
177 |     vmx_segment_access_rights guest_ar = { 0 };
178 |     guest_ar.type = SEGMENT_DESCRIPTOR_TYPE_CODE_ERA;
179 |     guest_ar.descriptor_type = true;
180 |     guest_ar.present = true;
181 | 
182 |     __vmwrite(VMCS_GUEST_CS_SEL, 0xf000);
183 |     __vmwrite(VMCS_GUEST_CS_BASE, 0xffff0000);
184 |     __vmwrite(VMCS_GUEST_CS_LIMIT, 0xffff);
185 |     __vmwrite(VMCS_GUEST_CS_ACCESS_RIGHTS, guest_ar.flags);
186 | 
187 |     guest_ar.type = SEGMENT_DESCRIPTOR_TYPE_DATA_RWA;
188 |     __vmwrite(VMCS_GUEST_SS_SEL, 0);
189 |     __vmwrite(VMCS_GUEST_SS_BASE, 0);
190 |     __vmwrite(VMCS_GUEST_SS_LIMIT, 0xffff);
191 |     __vmwrite(VMCS_GUEST_SS_ACCESS_RIGHTS, guest_ar.flags);
192 |     __vmwrite(VMCS_GUEST_DS_SEL, 0);
193 |     __vmwrite(VMCS_GUEST_DS_BASE, 0);
194 |     __vmwrite(VMCS_GUEST_DS_LIMIT, 0xffff);
195 |     __vmwrite(VMCS_GUEST_DS_ACCESS_RIGHTS, guest_ar.flags);
196 |     __vmwrite(VMCS_GUEST_ES_SEL, 0);
197 |     __vmwrite(VMCS_GUEST_ES_BASE, 0);
198 |     __vmwrite(VMCS_GUEST_ES_LIMIT, 0xffff);
199 |     __vmwrite(VMCS_GUEST_ES_ACCESS_RIGHTS, guest_ar.flags);
200 |     __vmwrite(VMCS_GUEST_FS_SEL, 0);
201 |     __vmwrite(VMCS_GUEST_FS_BASE, 0);
202 |     __vmwrite(VMCS_GUEST_FS_LIMIT, 0xffff);
203 |     __vmwrite(VMCS_GUEST_FS_ACCESS_RIGHTS, guest_ar.flags);
204 |     __vmwrite(VMCS_GUEST_GS_SEL, 0);
205 |     __vmwrite(VMCS_GUEST_GS_BASE, 0);
206 |     __vmwrite(VMCS_GUEST_GS_LIMIT, 0xffff);
207 |     __vmwrite(VMCS_GUEST_GS_ACCESS_RIGHTS, guest_ar.flags);
208 | 
209 |     __vmwrite(VMCS_GUEST_GDTR_BASE, 0);
210 |     __vmwrite(VMCS_GUEST_GDTR_LIMIT, 0xffff);
211 |     __vmwrite(VMCS_GUEST_IDTR_BASE, 0);
212 |     __vmwrite(VMCS_GUEST_IDTR_LIMIT, 0xffff);
213 | 
214 |     guest_ar.type = SEGMENT_DESCRIPTOR_TYPE_LDT;
215 |     guest_ar.descriptor_type = false;
216 |     __vmwrite(VMCS_GUEST_LDTR_SEL, 0);
217 |     __vmwrite(VMCS_GUEST_LDTR_BASE, 0);
218 |     __vmwrite(VMCS_GUEST_LDTR_LIMIT, 0xffff);
219 |     __vmwrite(VMCS_GUEST_LDTR_ACCESS_RIGHTS, guest_ar.flags);
220 | 
221 |     guest_ar.type = SEGMENT_DESCRIPTOR_TYPE_TSS_BUSY;
222 |     __vmwrite(VMCS_GUEST_TR_SEL, 0);
223 |     __vmwrite(VMCS_GUEST_TR_BASE, 0);
224 |     __vmwrite(VMCS_GUEST_TR_LIMIT, 0xffff);
225 |     __vmwrite(VMCS_GUEST_TR_ACCESS_RIGHTS, guest_ar.flags);
226 | 
227 |     /* Configure some extra functionality. */
228 |     __vmwrite(VMCS_GUEST_EFER, 0);
229 |     __vmwrite(VMCS_GUEST_FS_BASE, 0);
230 |     __vmwrite(VMCS_GUEST_GS_BASE, 0);
231 | 
232 |     /* Configure the debug registers. */
233 |     // __writedr(0, 0);
234 |     // __writedr(1, 0);
235 |     // __writedr(2, 0);
236 |     // __writedr(3, 0);
237 |     // __writedr(6, 0xffff0ff0);
238 |     __vmwrite(VMCS_GUEST_DR7, 0x400);
239 | 
240 |     /* Configure the guest context. */
241 |     struct vcpu_context *guest_context = &vcpu->guest_context;
242 | 
243 |     cpuid_eax_01 version_info;
244 |     CPUID_LEAF_READ(CPUID_VERSION_INFO, version_info);
245 |     guest_context->rdx = 0x600 | ((uint64_t)version_info.eax.extended_model_id << 16);
246 |     guest_context->rbx = 0;
247 |     guest_context->rcx = 0;
248 |     guest_context->rsi = 0;
249 |     guest_context->rdi = 0;
250 |     guest_context->rbp = 0;
251 |     guest_context->r8 = 0;
252 |     guest_context->r9 = 0;
253 |     guest_context->r10 = 0;
254 |     guest_context->r11 = 0;
255 |     guest_context->r12 = 0;
256 |     guest_context->r13 = 0;
257 |     guest_context->r14 = 0;
258 |     guest_context->r15 = 0;
259 | 
260 |     /* Configure instruction pointer, stack etc. */
261 |     rfl guest_rflags = {
262 |         .read_as_1 = true
263 |     };
264 | 
265 |     __vmwrite(VMCS_GUEST_RFLAGS, guest_rflags.flags);
266 |     __vmwrite(VMCS_GUEST_RIP, 0xfff0);
267 |     __vmwrite(VMCS_GUEST_RSP, 0);
268 | 
269 |     /* Configure the entry controls. */
270 |     ia32_vmx_entry_ctls_register entry_ctls = {
271 |         .flags = __vmread(VMCS_CTRL_ENTRY)
272 |     };
273 |     entry_ctls.ia32e_mode_guest = false;
274 |     __vmwrite(VMCS_CTRL_ENTRY, entry_ctls.flags);
275 | 
276 |     /* Indicate we are waiting for SIPI. */
277 |     __vmwrite(VMCS_GUEST_ACTIVITY_STATE, vmx_wait_for_sipi);
278 |     *move_to_next = false;
279 | }
280 | 
281 | static void handle_sipi(struct vcpu_ctx *vcpu, void *opaque, bool *move_to_next)
282 | {
283 |     (void)vcpu;
284 |     (void)opaque;
285 | 
286 |     uint64_t vector = __vmread(VMCS_EXIT_QUALIFICATION);
287 | 
288 |     __vmwrite(VMCS_GUEST_CS_SEL, vector << 8);
289 |     __vmwrite(VMCS_GUEST_CS_BASE, vector << 12);
290 |     __vmwrite(VMCS_GUEST_RIP, 0);
291 | 
292 |     __vmwrite(VMCS_GUEST_ACTIVITY_STATE, vmx_active);
293 |     *move_to_next = false;
294 | }
295 | 
296 | static void dump_guest_state(void)
297 | {
298 |     //
299 |     // 16-Bit Guest-State Fields
300 |     //
301 |     debug_print("Guest ES Selector              = %016llx", __vmread(VMCS_GUEST_ES_SEL));
302 |     debug_print("Guest CS Selector              = %016llx", __vmread(VMCS_GUEST_CS_SEL));
303 |     debug_print("Guest SS Selector              = %016llx", __vmread(VMCS_GUEST_SS_SEL));
304 |     debug_print("Guest DS Selector              = %016llx", __vmread(VMCS_GUEST_DS_SEL));
305 |     debug_print("Guest FS Selector              = %016llx", __vmread(VMCS_GUEST_FS_SEL));
306 |     debug_print("Guest GS Selector              = %016llx", __vmread(VMCS_GUEST_GS_SEL));
307 |     debug_print("Guest LDTR Selector            = %016llx", __vmread(VMCS_GUEST_LDTR_SEL));
308 |     debug_print("Guest TR Selector              = %016llx", __vmread(VMCS_GUEST_TR_SEL));
309 |     debug_print("Guest interrupt status         = %016llx", __vmread(VMCS_GUEST_INTR_STATUS));
310 |     debug_print("PML index                      = %016llx", __vmread(VMCS_GUEST_PML_INDEX));
311 | 
312 |     //
313 |     // 64-Bit Guest-State Fields
314 |     //
315 |     debug_print("VMCS link pointer              = %016llx", __vmread(VMCS_GUEST_VMCS_LINK_PTR));
316 |     debug_print("Guest IA32_DEBUGCTL            = %016llx", __vmread(VMCS_GUEST_DEBUGCTL));
317 |     debug_print("Guest IA32_PAT                 = %016llx", __vmread(VMCS_GUEST_PAT));
318 |     debug_print("Guest IA32_EFER                = %016llx", __vmread(VMCS_GUEST_EFER));
319 |     debug_print("Guest IA32_PERF_GLOBAL_CTRL    = %016llx", __vmread(VMCS_GUEST_PERF_GLOBAL_CTRL));
320 |     debug_print("Guest PDPTE0                   = %016llx", __vmread(VMCS_GUEST_PDPTE0));
321 |     debug_print("Guest PDPTE1                   = %016llx", __vmread(VMCS_GUEST_PDPTE1));
322 |     debug_print("Guest PDPTE2                   = %016llx", __vmread(VMCS_GUEST_PDPTE2));
323 |     debug_print("Guest PDPTE3                   = %016llx", __vmread(VMCS_GUEST_PDPTE3));
324 |     debug_print("Guest IA32_BNDCFGS             = %016llx", __vmread(VMCS_GUEST_BNDCFGS));
325 |     debug_print("Guest IA32_RTIT_CTL            = %016llx", __vmread(VMCS_GUEST_RTIT_CTL));
326 | 
327 |     //
328 |     // 32-Bit Guest-State Fields
329 |     //
330 |     debug_print("Guest ES Limit                 = %016llx", __vmread(VMCS_GUEST_ES_LIMIT));
331 |     debug_print("Guest CS Limit                 = %016llx", __vmread(VMCS_GUEST_CS_LIMIT));
332 |     debug_print("Guest SS Limit                 = %016llx", __vmread(VMCS_GUEST_SS_LIMIT));
333 |     debug_print("Guest DS Limit                 = %016llx", __vmread(VMCS_GUEST_DS_LIMIT));
334 |     debug_print("Guest FS Limit                 = %016llx", __vmread(VMCS_GUEST_FS_LIMIT));
335 |     debug_print("Guest GS Limit                 = %016llx", __vmread(VMCS_GUEST_GS_LIMIT));
336 |     debug_print("Guest LDTR Limit               = %016llx", __vmread(VMCS_GUEST_LDTR_LIMIT));
337 |     debug_print("Guest TR Limit                 = %016llx", __vmread(VMCS_GUEST_TR_LIMIT));
338 |     debug_print("Guest GDTR limit               = %016llx", __vmread(VMCS_GUEST_GDTR_LIMIT));
339 |     debug_print("Guest IDTR limit               = %016llx", __vmread(VMCS_GUEST_IDTR_LIMIT));
340 |     debug_print("Guest ES access rights         = %016llx", __vmread(VMCS_GUEST_ES_ACCESS_RIGHTS));
341 |     debug_print("Guest CS access rights         = %016llx", __vmread(VMCS_GUEST_CS_ACCESS_RIGHTS));
342 |     debug_print("Guest SS access rights         = %016llx", __vmread(VMCS_GUEST_SS_ACCESS_RIGHTS));
343 |     debug_print("Guest DS access rights         = %016llx", __vmread(VMCS_GUEST_DS_ACCESS_RIGHTS));
344 |     debug_print("Guest FS access rights         = %016llx", __vmread(VMCS_GUEST_FS_ACCESS_RIGHTS));
345 |     debug_print("Guest GS access rights         = %016llx", __vmread(VMCS_GUEST_GS_ACCESS_RIGHTS));
346 |     debug_print("Guest LDTR access rights       = %016llx", __vmread(VMCS_GUEST_LDTR_ACCESS_RIGHTS));
347 |     debug_print("Guest TR access rights         = %016llx", __vmread(VMCS_GUEST_TR_ACCESS_RIGHTS));
348 |     debug_print("Guest interruptibility state   = %016llx", __vmread(VMCS_GUEST_INTERRUPTIBILITY_STATE));
349 |     debug_print("Guest activity state           = %016llx", __vmread(VMCS_GUEST_ACTIVITY_STATE));
350 |     debug_print("Guest SMBASE                   = %016llx", __vmread(VMCS_GUEST_SMBASE));
351 |     debug_print("Guest IA32_SYSENTER_CS         = %016llx", __vmread(VMCS_GUEST_SYSENTER_CS));
352 |     debug_print("VMX-preemption timer value     = %016llx", __vmread(VMCS_GUEST_PREEMPT_TIMER_VALUE));
353 | 
354 |     //
355 |     // Natural-Width Guest-State Fields
356 |     //
357 |     debug_print("Guest CR0                      = %016llx", __vmread(VMCS_GUEST_CR0));
358 |     debug_print("Guest CR3                      = %016llx", __vmread(VMCS_GUEST_CR3));
359 |     debug_print("Guest CR4                      = %016llx", __vmread(VMCS_GUEST_CR4));
360 |     debug_print("Guest ES Base                  = %016llx", __vmread(VMCS_GUEST_ES_BASE));
361 |     debug_print("Guest CS Base                  = %016llx", __vmread(VMCS_GUEST_CS_BASE));
362 |     debug_print("Guest SS Base                  = %016llx", __vmread(VMCS_GUEST_SS_BASE));
363 |     debug_print("Guest DS Base                  = %016llx", __vmread(VMCS_GUEST_DS_BASE));
364 |     debug_print("Guest FS Base                  = %016llx", __vmread(VMCS_GUEST_FS_BASE));
365 |     debug_print("Guest GS Base                  = %016llx", __vmread(VMCS_GUEST_GS_BASE));
366 |     debug_print("Guest LDTR base                = %016llx", __vmread(VMCS_GUEST_LDTR_BASE));
367 |     debug_print("Guest TR base                  = %016llx", __vmread(VMCS_GUEST_TR_BASE));
368 |     debug_print("Guest GDTR base                = %016llx", __vmread(VMCS_GUEST_GDTR_BASE));
369 |     debug_print("Guest IDTR base                = %016llx", __vmread(VMCS_GUEST_IDTR_BASE));
370 |     debug_print("Guest DR7                      = %016llx", __vmread(VMCS_GUEST_DR7));
371 |     debug_print("Guest RSP                      = %016llx", __vmread(VMCS_GUEST_RSP));
372 |     debug_print("Guest RIP                      = %016llx", __vmread(VMCS_GUEST_RIP));
373 |     debug_print("Guest RFLAGS                   = %016llx", __vmread(VMCS_GUEST_RFLAGS));
374 |     debug_print("Guest pending debug exceptions = %016llx", __vmread(VMCS_GUEST_PENDING_DEBUG_EXCEPTIONS));
375 |     debug_print("Guest IA32_SYSENTER_ESP        = %016llx", __vmread(VMCS_GUEST_SYSENTER_ESP));
376 |     debug_print("Guest IA32_SYSENTER_EIP        = %016llx", __vmread(VMCS_GUEST_SYSENTER_EIP));
377 | }
378 | 
379 | static void dump_guest_page(void)
380 | {
381 |     static const int PRINT_PER_LINE = 32;
382 |     uintptr_t guest_rip = __vmread(VMCS_GUEST_RIP);
383 |     uint8_t *page_start = (uint8_t *)(guest_rip & ~0xFFFull);
384 |     uint8_t *page_end = (uint8_t *)(page_start + PAGE_SIZE);
385 | 
386 |     debug_print("Dumping guest page of RIP=0x%lX 0x%lX-0x%lX", guest_rip, page_start, page_end);
387 | 
388 |     for (uint8_t* current = page_start; current < page_end; current += PRINT_PER_LINE) {
389 |         print_buffer("0x%016lX          ", current);
390 |         for (int i = 0; i < PRINT_PER_LINE; i++) {
391 |             print_buffer("%02X ", current[i]);
392 |         }
393 |         print_buffer("\r\n");
394 |     }
395 |     print_buffer("\r\n");
396 | }
397 | 
398 | static void dump_state(struct vcpu_ctx *vcpu)
399 | {
400 |     (void)vcpu;
401 |     dump_guest_state();
402 |     dump_guest_page();
403 | }
404 | 
405 | static void handle_exit_reason(struct vcpu_ctx *vcpu)
406 | {
407 |     /* Retrieve the handler context from the vCPU. */
408 |     struct handler_ctx *ctx = vcpu->vmm->handler;
409 | 
410 |     /* Determine the exit reason and then call the appropriate exit handler. */
411 |     size_t reason = __vmread(VMCS_EXIT_REASON) & 0xFFFF;
412 |     bool move_to_next_instr = false;
413 | 
414 |     /* Check to see if the exit reason is out of range. */
415 |     die_on(reason >= MAX_EXIT_HANDLERS,
416 |         "Exit reason 0x%lX rip 0x%lX not within range of handler table",
417 |         reason, vcpu->guest_context.rip);
418 | 
419 |     struct vmexit_handler *exit_head = ctx->handlers[reason];
420 | 
421 | 
422 |     if (!exit_head) {
423 |         dump_state(vcpu);
424 |         /* Check to see if we actually have a handler for it. */
425 |         uint8_t *rip_bytes = (uint8_t *)vcpu->guest_context.rip;
426 |         die_on(!exit_head, "vcpu=%d no exit reason handlers for 0x%lX at rip 0x%lX "
427 |             "rip[0]=%02X rip[1]=%02X rip[2]=%02X rip[3]=%02X rip[4]=%02X rip[5]=%02X",
428 |             vcpu->idx, reason, vcpu->guest_context.rip,
429 |             rip_bytes[0], rip_bytes[1], rip_bytes[2], rip_bytes[3], rip_bytes[4], rip_bytes[5]);
430 |     }
431 | 
432 |     /* Iterate from tail to head calling each, stop at override. */
433 |     struct vmexit_handler *curr_handler = exit_head->prev;
434 |     while (true) {
435 |         /* Call the callback for the VMEXIT. If this handler has the override
436 |          * set, this means we SHOULDN'T call any others, so break. */
437 |         curr_handler->callback(vcpu, curr_handler->opaque, &move_to_next_instr);
438 |     
439 |         if (curr_handler->override)
440 |             break;
441 | 
442 |         if (curr_handler == exit_head)
443 |             break;
444 |         
445 |         curr_handler = curr_handler->prev;
446 |     };
447 | 
448 |     /*
449 |      * If the exit handler indicated to increment RIP, do so.
450 |      * We cannot use the guest_context.rip field to increment as
451 |      * this will not be restored on re-enter to the guest, we need to
452 |      * directly write to the VMCS field instead.
453 |      */
454 |     if (move_to_next_instr) {
455 |         size_t guest_rip = __vmread(VMCS_GUEST_RIP);
456 |         guest_rip += __vmread(VMCS_EXIT_INSTR_LENGTH);
457 |         __vmwrite(VMCS_GUEST_RIP, guest_rip);
458 |     }
459 | 
460 |     handle_cached_interrupts(vcpu);
461 | }
462 | 
463 | static void register_generic_handlers(struct handler_ctx *ctx)
464 | {
465 |     static const vmexit_cbk_t GENERIC_HANDLERS[MAX_EXIT_HANDLERS] = {
466 |         [VMX_EXIT_REASON_CPUID] = handle_cpuid,
467 |         [VMX_EXIT_REASON_XSETBV] = handle_xsetbv,
468 |         [VMX_EXIT_REASON_INVD] = handle_invd,
469 |         [VMX_EXIT_REASON_INIT_SIGNAL] = handle_init_signal,
470 |         [VMX_EXIT_REASON_SIPI] = handle_sipi,
471 |     };
472 | 
473 |     /* Register all of our generic handlers
474 |      * This is done by iterating a key/value array of exit to internal handlers. */
475 |     for (int exit_reason = 0; exit_reason < MAX_EXIT_HANDLERS; exit_reason++) {
476 |         vmexit_cbk_t cbk = GENERIC_HANDLERS[exit_reason];
477 | 
478 |         if (cbk) {
479 |             DEBUG_PRINT("Registering generic exit 0x%lX callback 0x%lX", exit_reason, cbk);
480 |             handler_register_exit(ctx, exit_reason, cbk, NULL, false);
481 |         }
482 |     }
483 | }
484 | 
485 | struct handler_ctx *handler_init(struct vmm_ctx *vmm)
486 | {
487 |     struct handler_ctx *ctx = vmem_alloc(sizeof(struct handler_ctx), MEM_WRITE);
488 |     die_on(!ctx, "Unable to allocate context for VMEXIT handlers.");
489 |     vmm->handler = ctx;
490 |     ctx->vmm = vmm;
491 | 
492 |     register_generic_handlers(ctx);
493 |     return ctx;
494 | }
495 | 
496 | void handler_register_exit(struct handler_ctx *ctx,
497 |                            size_t exit_reason,
498 |                            vmexit_cbk_t callback,
499 |                            void *opaque,
500 |                            bool override)
501 | {
502 |     /* Ensure synchronization. */
503 |     spin_lock(&ctx->vmm->lock);
504 | 
505 |     die_on(exit_reason >= MAX_EXIT_HANDLERS, "Invalid exit handler index 0x%lX", exit_reason);
506 | 
507 |     /* Allocate a new vmexit handler. */
508 |     struct vmexit_handler *new_handler = vmem_alloc(sizeof(struct vmexit_handler), MEM_WRITE);
509 |     die_on(!new_handler, "Unable to allocate memory for VMEXIT handler.");
510 | 
511 |     /* Fill out the information for the handler. */
512 |     new_handler->callback = callback;
513 |     new_handler->opaque = opaque;
514 |     new_handler->override = override;
515 | 
516 |     /* Now manipulate the linked list for vmexit entry. */
517 |     struct vmexit_handler **exit_base = &ctx->handlers[exit_reason];
518 |     struct vmexit_handler *exit_head = *exit_base;
519 |     new_handler->next = NULL;
520 |     if (exit_head == NULL) {
521 |         /* No current exit handlers. */
522 |         new_handler->prev = new_handler;
523 |         *exit_base = new_handler;
524 |     } else {
525 |         /* An event already exists.
526 |          * Add our new handler to the tail of the list. */
527 |         struct vmexit_handler *exit_tail = exit_head->prev;
528 |         die_on(override && exit_tail->override,
529 |                "Cannot override if an override for exit 0x%X already set",
530 |                exit_reason);
531 | 
532 |         new_handler->prev = exit_tail;
533 |         exit_tail->next = new_handler;
534 |     }
535 | 
536 |     DEBUG_PRINT("VMEXIT registered for 0x%lX cbk 0x%lX opaque 0x%lX override %d",
537 |                 exit_reason, callback, opaque, override);
538 |     spin_unlock(&ctx->vmm->lock);
539 | }
540 | 
541 | __attribute__((ms_abi)) void handler_guest_to_host(struct vcpu_context *guest_ctx)
542 | {
543 |     /*
544 |      * Because we had to use RCX in shim_guest_to_host as a parameter
545 |      * for the __capture_context which gets passed to this handler
546 |      * we have to retrieve this value and store it back in the guest_context. */
547 |     uint64_t restore_rsp = guest_ctx->rsp + sizeof(uint64_t);
548 |     guest_ctx->rcx = *(uint64_t *)((uintptr_t)guest_ctx - sizeof(guest_ctx->rcx));
549 | 
550 |     /* Set what the guest RIP and RSP were. */
551 |     guest_ctx->rsp = __vmread(VMCS_GUEST_RSP);
552 |     guest_ctx->rip = __vmread(VMCS_GUEST_RIP);
553 | 
554 |     /*
555 |      * Find the vcpu_ctx structure by backtracing from the guest_ctx which we can
556 |      * assume was stored on the host_stack.
557 |      */
558 |     struct vcpu_ctx *vcpu = vmm_get_vcpu_ctx();
559 | 
560 |     /* Indicate running as host and then copy the guest context from stack to vcpu struct. */
561 |     vcpu->guest_context = *guest_ctx;
562 | 
563 |     /* Handle the VMEXIT reason. */
564 |     handle_exit_reason(vcpu);
565 | 
566 |     /* 
567 |      * Trigger the return back into guest mode, by adjusting RIP in our stored
568 |      * guest context and then adjust the context RIP to our VMRESUME handler.
569 |      */
570 |     vcpu->guest_context.rsp = restore_rsp;
571 |     vcpu->guest_context.rip = (uint64_t)__vmresume;
572 |     __restore_context(&vcpu->guest_context);
573 | }


--------------------------------------------------------------------------------
/hypervisor/vmm/vmm.c:
--------------------------------------------------------------------------------
  1 | #define DEBUG_MODULE
  2 | #include "platform/standard.h"
  3 | #include "platform/intrin.h"
  4 | #include "platform/util.h"
  5 | #include "memory/pmem.h"
  6 | #include "memory/mem.h"
  7 | #include "vmm.h"
  8 | #include "ept.h"
  9 | #include "handler.h"
 10 | #include "vmcall.h"
 11 | #include "nested.h"
 12 | #include "shim.h"
 13 | #include "impl_hooks.h"
 14 | #include "ia32_compact.h"
 15 | 
 16 | /* The main VMM that will initialise the hypervisor,
 17 |  * currently this is aimed at only targetting x86_64 platforms. */
 18 | 
 19 | /* Mask used to ignore the ring level when specifying a selector. */
 20 | #define IGNORE_RPL_MASK (~3)
 21 | 
 22 | /* Holds information on a GDT entry. */
 23 | struct gdt_entry {
 24 |     size_t base;
 25 |     uint32_t limit;
 26 |     uint16_t sel;
 27 |     vmx_segment_access_rights access;
 28 | };
 29 | 
 30 | /* Global const offset used for calculating where the stack pointer
 31 |  * will be offsetted from the vcpu_ctx upon a guest_to_host or vice versa
 32 |  * transition. We have to use this as our shim assembly code uses it for
 33 |  * reclaiming the vcpu_ctx pointer upon hyperjacking. */
 34 | const size_t VMM_HYPERJACK_STACK_OFFSET = offsetof(struct vcpu_ctx, host_stack) +
 35 |                                         HOST_STACK_SIZE - sizeof(struct vcpu_context);
 36 | 
 37 | static void probe_capabilities()
 38 | {
 39 |     DEBUG_PRINT("Checking CPU capabilities.");
 40 | 
 41 |     cpuid_eax_01 version_info;
 42 |     int rc = CPUID_LEAF_READ(CPUID_VERSION_INFO, version_info);
 43 |     die_on(!rc, "Unable to query version information.");
 44 |     die_on(!version_info.ecx.virtual_machine_extensions, "No virtual machine extensions.");
 45 | 
 46 |     cpuid_eax_80000001 extend_cpu;
 47 |     rc = CPUID_LEAF_READ(CPUID_EXTENDED_CPU_SIGNATURE, extend_cpu);
 48 |     die_on(!rc, "Unable to read extended CPUID signature.");
 49 |     die_on(!extend_cpu.edx.pages_1gb_available, "No 1GB pages support.");
 50 | 
 51 |     ia32_feature_control_register feature_control;
 52 |     feature_control.flags = rdmsr(IA32_FEATURE_CONTROL);
 53 |     die_on(!feature_control.lock_bit, "Lock bit not set.");
 54 |     die_on(!feature_control.enable_vmx_outside_smx, "VMX not enabled outside SMX.");
 55 | 
 56 |     ia32_vmx_ept_vpid_cap_register ept_vpid;
 57 |     ept_vpid.flags = rdmsr(IA32_VMX_EPT_VPID_CAP);
 58 |     die_on(!ept_vpid.page_walk_length_4, "EPT PML4 not supported.");
 59 |     die_on(!ept_vpid.memory_type_write_back, "EPT memory type WB not supported.");
 60 |     die_on(!ept_vpid.pde_2mb_pages, "EPT 2MB pages not supported.");
 61 | 
 62 |     DEBUG_PRINT("CPU seems to provide all capabilities needed.");
 63 | }
 64 | 
 65 | static void print_gdt(char *prefix, segment_descriptor_register_64 *gdtr)
 66 | {
 67 |     (void)prefix;
 68 |     (void)gdtr;
 69 | 
 70 |     #ifdef DEBUG_MODULE
 71 |         DEBUG_PRINT("--- %s GDT base 0x%lX limit 0x%lX", prefix, gdtr->base_address, gdtr->limit);
 72 | 
 73 |         segment_descriptor_32 *gdt = (segment_descriptor_32 *)gdtr->base_address;
 74 |         int desc_max = (gdtr->limit + 1ull) / sizeof(segment_descriptor_32);
 75 |         for (int i = 0; i < desc_max; i++) {
 76 |             segment_descriptor_32 *curr_desc = (segment_descriptor_32 *)&gdt[i];
 77 | 
 78 |             uint32_t seg_lim = (curr_desc->segment_limit_high << 16) | curr_desc->segment_limit_low;
 79 |             uintptr_t base_addr = (curr_desc->base_address_high << 24) |
 80 |                                 (curr_desc->base_address_middle << 16) |
 81 |                                 (curr_desc->base_address_low & UINT16_MAX);
 82 | 
 83 |             DEBUG_PRINT("Descriptor 0x%lX\r\n" \
 84 |                       "------ Flags 0x%X\r\n" \
 85 |                       "------ Present 0x%X\r\n" \
 86 |                       "------ Type 0x%X\r\n" \
 87 |                       "------ Segment limit 0x%X\r\n" \
 88 |                       "------ Base address 0x%lX\r\n",
 89 |                       (uintptr_t)curr_desc,
 90 |                       curr_desc->flags,
 91 |                       curr_desc->present,
 92 |                       curr_desc->type,
 93 |                       seg_lim,
 94 |                       base_addr);
 95 |         }
 96 |     #endif /* DEBUG_VMM */
 97 | }
 98 | 
 99 | static void configure_vcpu_gdt(struct gdt_config *gdt_cfg)
100 | {
101 |     /* Everything within the GDT is in linear/physical addresses
102 |      * rather than virtual, therefore we need to retrieve CR3 so
103 |      * that we can do some conversions from virt to phys. */
104 |     cr3 this_cr3;
105 |     this_cr3.flags = __readcr3();
106 | 
107 |     /* Read the original GDTR and store it so we can use it for the guest later. */
108 |     __sgdt(&gdt_cfg->guest_gdtr);
109 |     __sldt(&gdt_cfg->guest_ldtr);
110 |     die_on(!gdt_cfg->guest_gdtr.base_address, "No base address set for guest GDTR");
111 |     die_on(!gdt_cfg->guest_gdtr.limit, "No limit set for guest GDTR");
112 | 
113 |     /* For the host GDT we're going to copy the guest GDT and then append
114 |      * a TSS to the GDT as this is required for VMX to be used, unfortunately
115 |      * the UEFI environment doesn't set this up. */
116 |     memcpy(gdt_cfg->host_gdt,
117 |            (const void *)gdt_cfg->guest_gdtr.base_address,
118 |            gdt_cfg->guest_gdtr.limit);
119 | 
120 |     /* Configure the GDTR we're going to use for the host. */
121 |     uintptr_t host_gdt_phys = mem_va_to_pa(this_cr3, gdt_cfg->host_gdt);
122 |     gdt_cfg->host_gdtr.base_address = host_gdt_phys;
123 |     gdt_cfg->host_gdtr.limit = gdt_cfg->guest_gdtr.limit + sizeof(segment_descriptor_64);
124 |     DEBUG_PRINT("Host GDTR base %lX limit %lX",
125 |               gdt_cfg->host_gdtr.base_address,
126 |               gdt_cfg->host_gdtr.limit);
127 | 
128 |     /* Append the TR to the end of the GDT. */
129 |     gdt_cfg->host_tr.flags = 0;
130 |     gdt_cfg->host_tr.index = (gdt_cfg->guest_gdtr.limit + 1ull) / sizeof(segment_descriptor_32);
131 |     DEBUG_PRINT("Host TR index %d", gdt_cfg->host_tr.index);
132 | 
133 |     uintptr_t tss_pa = mem_va_to_pa(this_cr3, &gdt_cfg->host_tss);
134 |     segment_descriptor_64 tss_desc = { 0 };
135 |     tss_desc.segment_limit_low = sizeof(struct task_state_segment_64) - 1;
136 |     tss_desc.base_address_low = tss_pa & UINT16_MAX;
137 |     tss_desc.base_address_middle = (tss_pa >> 16) & UINT8_MAX;
138 |     tss_desc.base_address_high = (tss_pa >> 24) & UINT8_MAX;
139 |     tss_desc.base_address_upper = (tss_pa >> 32) & UINT32_MAX;
140 |     tss_desc.type = SEGMENT_DESCRIPTOR_TYPE_TSS_AVAILABLE;
141 |     tss_desc.present = true;
142 | 
143 |     /* Now write the newly created TSS to our host GDT. */
144 |     segment_descriptor_32 *gdt32 = (segment_descriptor_32 *)gdt_cfg->host_gdt;
145 |     segment_descriptor_64 *tss_in_gdt = (segment_descriptor_64 *)&gdt32[gdt_cfg->host_tr.index];
146 |     *tss_in_gdt = tss_desc;
147 | 
148 |     print_gdt("Host", &gdt_cfg->host_gdtr);
149 | 
150 |     /* Write the new GDTR and TR. */
151 |     uintptr_t phys_gdtr = mem_va_to_pa(this_cr3, &gdt_cfg->host_gdtr);
152 |     __lgdt((void*)phys_gdtr);
153 |     __ltr(&gdt_cfg->host_tr);
154 | }
155 | 
156 | static void capture_control_regs(struct control_registers *regs)
157 | {
158 |     regs->reg_cr0.flags = __readcr0();
159 |     regs->reg_cr3.flags = __readcr3();
160 |     regs->reg_cr4.flags = __readcr4();
161 |     regs->debugctl.flags = rdmsr(IA32_DEBUGCTL);
162 |     regs->gs_base = rdmsr(IA32_GS_BASE);
163 |     regs->dr7 = __readdr7();
164 | 
165 |     DEBUG_PRINT("--- cr0 %lX\r\n" \
166 |               "--- cr3 %lX\r\n" \
167 |               "--- cr4 %lX\r\n" \
168 |               "--- debugctl %lX\r\n" \
169 |               "--- gs_base %lX\r\n" \
170 |               "--- dr7 %lX",
171 |               regs->reg_cr0.flags, regs->reg_cr3.flags, regs->reg_cr4.flags,
172 |               regs->debugctl.flags, regs->gs_base, regs->dr7);
173 | }
174 | 
175 | static void enter_root_mode(struct vcpu_ctx *vcpu)
176 | {
177 |     /* Set up root VMXON and the guest VMCS. */
178 |     ia32_vmx_basic_register basic;
179 |     basic.flags = rdmsr(IA32_VMX_BASIC);
180 | 
181 |     memset(&vcpu->host_vmxon, 0, sizeof(vcpu->host_vmxon));
182 |     memset(&vcpu->guest_vmcs, 0, sizeof(vcpu->guest_vmcs));
183 |     vcpu->host_vmxon.revision_id = basic.vmcs_revision_id;
184 |     vcpu->guest_vmcs.revision_id = basic.vmcs_revision_id;
185 | 
186 |     /* Set the fixed requirements for the control registers for VMX. */
187 |     vcpu->guest_ctrl_regs.reg_cr0 = vmm_adjust_cr0(vcpu->guest_ctrl_regs.reg_cr0);
188 |     vcpu->guest_ctrl_regs.reg_cr4 = vmm_adjust_cr4(vcpu->guest_ctrl_regs.reg_cr4);
189 | 
190 |     /* Update host CR0/4 with new updated fields. */
191 |     __writecr0(vcpu->guest_ctrl_regs.reg_cr0.flags);
192 |     __writecr4(vcpu->guest_ctrl_regs.reg_cr4.flags);
193 | 
194 |     /* Calculate the physical addresses of vmxon and vmcs. */
195 |     cr3 this_cr3;
196 |     this_cr3.flags = __readcr3();
197 |     void *phys_vmxon = (void *)mem_va_to_pa(this_cr3, &vcpu->host_vmxon);
198 |     void *phys_vmcs = (void *)mem_va_to_pa(this_cr3, &vcpu->guest_vmcs);
199 | 
200 |     die_on(!__vmxon(&phys_vmxon), "Unable to enter VMX root mode.");
201 |     die_on(!__vmclear(&phys_vmcs), "Unable to clear VMCS.");
202 |     die_on(!__vmptrld(&phys_vmcs), "Unable to load the VMCS.");
203 | }
204 | 
205 | static uint64_t encode_msr(uint64_t ctrl, uint64_t desired)
206 | {
207 |     /*
208 |      * VMX feature/capability MSRs encode the "must be 0" bits in the high word
209 |      * of their value, and the "must be 1" bits in the low word of their value.
210 |      * Adjust any requested capability/feature based on these requirements.
211 |      */
212 |     desired &= (uint32_t)(ctrl >> 32);
213 |     desired |= (uint32_t)ctrl;
214 |     return desired;
215 | }
216 | 
217 | static void gather_gdt_entry(segment_descriptor_register_64 *gdtr, uint16_t sel,
218 |                              struct gdt_entry *entry)
219 | {
220 |     /* If the selector is not valid (0) set unusable entry. */
221 |     if (sel == 0) {
222 |         entry->sel = 0;
223 |         entry->limit = 0;
224 |         entry->base = 0;
225 |         entry->access.flags = 0;
226 |         entry->access.unusable = true;
227 |         return;
228 |     }
229 | 
230 |     /* Calculate the descriptor pointer */
231 |     segment_descriptor_64 *descriptor =
232 |         (segment_descriptor_64 *)(gdtr->base_address + (sel & IGNORE_RPL_MASK));
233 | 
234 |     /* Fill in the entry information. */
235 |     entry->sel = sel;
236 |     entry->limit = (descriptor->segment_limit_high << 16) | descriptor->segment_limit_low;
237 |     entry->base = ((size_t)descriptor->base_address_high << 24) |
238 |                   ((size_t)descriptor->base_address_middle << 16) |
239 |                   ((size_t)descriptor->base_address_low);
240 | 
241 |     if (descriptor->descriptor_type == 0) {
242 |         entry->base |= (size_t)descriptor->base_address_upper << 32;
243 |     }
244 | 
245 |     /* Access rights are defines as the middle 16 bits of the descriptor flags section. */
246 |     entry->access.flags = (descriptor->flags >> 8) & 0xFFFF;
247 |     entry->access.unusable = !descriptor->present;
248 |     entry->access.reserved_1 = 0;
249 | }
250 | 
251 | static void setup_vmcs_host(struct vmm_ctx *vmm, struct vcpu_ctx *vcpu)
252 | {
253 |     /* Configure the host context, as we're hyperjacking we want
254 |      * to clone the original context as much as possible for ease
255 |      * of use. */
256 |     struct vcpu_context *guest_ctx = &vcpu->hyperjack_context;
257 | 
258 |     /* Write all of the selectors, ignoring the RPL for each field
259 |      * as the host environment will always be ring-0. */
260 |     __vmwrite(VMCS_HOST_CS_SEL, guest_ctx->seg_cs & IGNORE_RPL_MASK);
261 |     __vmwrite(VMCS_HOST_SS_SEL, guest_ctx->seg_ss & IGNORE_RPL_MASK);
262 |     __vmwrite(VMCS_HOST_DS_SEL, guest_ctx->seg_ds & IGNORE_RPL_MASK);
263 |     __vmwrite(VMCS_HOST_ES_SEL, guest_ctx->seg_es & IGNORE_RPL_MASK);
264 |     __vmwrite(VMCS_HOST_FS_SEL, guest_ctx->seg_fs & IGNORE_RPL_MASK);
265 |     __vmwrite(VMCS_HOST_GS_SEL, guest_ctx->seg_gs & IGNORE_RPL_MASK);
266 | 
267 |     /* As in a UEFI environment TR is not set, therefore we use our own
268 |      * generated one when we modified the GDT. */
269 |     __vmwrite(VMCS_HOST_TR_SEL, vcpu->gdt_cfg.host_tr.flags & IGNORE_RPL_MASK);
270 | 
271 |     /* Now write all of the BASE registers that are used for the host. */
272 |     __vmwrite(VMCS_HOST_GDTR_BASE, vcpu->gdt_cfg.host_gdtr.base_address);
273 |     __vmwrite(VMCS_HOST_IDTR_BASE, vmm->init.host_idtr.base_address);
274 | 
275 |     /*
276 |      * We (ab)use the GS_BASE field to store out vCPU context, so that
277 |      * when we're in host context it's easy to retrieve which vCPU we are
278 |      * via the GS_BASE field.
279 |      */
280 |     __vmwrite(VMCS_HOST_GS_BASE, (uintptr_t)vcpu);
281 | 
282 |     /* Get the GDT information for FS & TR so we can write these for the host VMCS. */
283 |     struct gdt_entry entry;
284 |     gather_gdt_entry(&vcpu->gdt_cfg.host_gdtr, guest_ctx->seg_fs, &entry);
285 |     __vmwrite(VMCS_HOST_FS_BASE, entry.base);
286 | 
287 |     gather_gdt_entry(&vcpu->gdt_cfg.host_gdtr, vcpu->gdt_cfg.host_tr.flags, &entry);
288 |     __vmwrite(VMCS_HOST_TR_BASE, entry.base);
289 | 
290 |     /* SYSENTRY fields. */
291 |     __vmwrite(VMCS_HOST_SYSENTER_ESP, rdmsr(IA32_SYSENTER_ESP));
292 |     __vmwrite(VMCS_HOST_SYSENTER_EIP, rdmsr(IA32_SYSENTER_EIP));
293 | 
294 |     /* Control registers (using our own CR3 value for paging). */
295 |     __vmwrite(VMCS_HOST_CR0, vcpu->guest_ctrl_regs.reg_cr0.flags);
296 |     __vmwrite(VMCS_HOST_CR3, vmm->init.host_cr3.flags);
297 |     __vmwrite(VMCS_HOST_CR4, vcpu->guest_ctrl_regs.reg_cr4.flags);
298 | 
299 |     /* Extended feature enable registers. */
300 |     __vmwrite(VMCS_HOST_EFER, rdmsr(IA32_EFER));
301 | 
302 |     /*
303 |     * Load the hypervisor entrypoint and stack. We give ourselves a standard
304 |     * size kernel stack (24KB) and bias for the context structure that the
305 |     * hypervisor entrypoint will push on the stack, avoiding the need for RSP
306 |     * modifying instructions in the entrypoint. Note that the CONTEXT pointer
307 |     * and thus the stack itself, must be 16-byte aligned for ABI compatibility
308 |     * with AMD64 -- specifically, XMM operations will fail otherwise, such as
309 |     * the ones that __capture_context will perform.
310 |     */
311 |     uintptr_t host_rip = (uintptr_t)shim_guest_to_host;
312 |     uintptr_t host_rsp = (uintptr_t)vcpu + VMM_HYPERJACK_STACK_OFFSET;
313 | 
314 |     __vmwrite(VMCS_HOST_RSP, host_rsp);
315 |     __vmwrite(VMCS_HOST_RIP, host_rip);
316 |     DEBUG_PRINT("VMCS_HOST_RIP: 0x%lX VMCS_HOST_RSP: 0x%lX", host_rip, host_rsp);
317 | }
318 | 
319 | __attribute__((noreturn)) static void vmm_hyperjack_handler(void)
320 | {
321 |     /*
322 |      * We are currently executing in the guest, after a successful
323 |      * initial VMLAUNCH, so now we need to return to our hyperjacking
324 |      * code path where the initial init_routine_per_vcpu::__capture_context
325 |      * took place.
326 |      *
327 |      * This time with the launched flag set, therefore the driver
328 |      * should then exit successfully.
329 |      */
330 |     struct vcpu_ctx *vcpu = vmm_get_vcpu_ctx();
331 | 
332 |     vcpu->launched = true;
333 |     __restore_context(&vcpu->hyperjack_context);
334 |     die_on(true, "Shouldn't be here context should have been restored.");
335 | }
336 | 
337 | static void setup_vmcs_guest(struct vmm_ctx *vmm, struct vcpu_ctx *vcpu)
338 | {
339 |     /* 
340 |      * Defines a generic structure so that we can iteratively write all segment
341 |      * fields needed for the guest.
342 |      */
343 |     struct gdt_config {
344 |         uint16_t sel;
345 |         uint32_t vmcs_sel;
346 |         uint32_t vmcs_lim;
347 |         uint32_t vmcs_ar;
348 |         uint32_t vmcs_base;
349 |         segment_descriptor_register_64 *gdtr;
350 |     };
351 | 
352 |     struct vcpu_context *guest_ctx = &vcpu->hyperjack_context;
353 | 
354 |     const struct gdt_config vmcs_gdt_list[] = {
355 |         {
356 |             .sel = guest_ctx->seg_cs,
357 |             .vmcs_sel = VMCS_GUEST_CS_SEL,
358 |             .vmcs_lim = VMCS_GUEST_CS_LIMIT,
359 |             .vmcs_ar = VMCS_GUEST_CS_ACCESS_RIGHTS,
360 |             .vmcs_base = VMCS_GUEST_CS_BASE,
361 |             .gdtr = &vcpu->gdt_cfg.host_gdtr,
362 |         },
363 |         {
364 |             .sel = guest_ctx->seg_ss,
365 |             .vmcs_sel = VMCS_GUEST_SS_SEL,
366 |             .vmcs_lim = VMCS_GUEST_SS_LIMIT,
367 |             .vmcs_ar = VMCS_GUEST_SS_ACCESS_RIGHTS,
368 |             .vmcs_base = VMCS_GUEST_SS_BASE,
369 |             .gdtr = &vcpu->gdt_cfg.host_gdtr,
370 |         },
371 |         {
372 |             .sel = guest_ctx->seg_ds,
373 |             .vmcs_sel = VMCS_GUEST_DS_SEL,
374 |             .vmcs_lim = VMCS_GUEST_DS_LIMIT,
375 |             .vmcs_ar = VMCS_GUEST_DS_ACCESS_RIGHTS,
376 |             .vmcs_base = VMCS_GUEST_DS_BASE,
377 |             .gdtr = &vcpu->gdt_cfg.host_gdtr,
378 |         },
379 |         {
380 |             .sel = guest_ctx->seg_es,
381 |             .vmcs_sel = VMCS_GUEST_ES_SEL,
382 |             .vmcs_lim = VMCS_GUEST_ES_LIMIT,
383 |             .vmcs_ar = VMCS_GUEST_ES_ACCESS_RIGHTS,
384 |             .vmcs_base = VMCS_GUEST_ES_BASE,
385 |             .gdtr = &vcpu->gdt_cfg.host_gdtr,
386 |         },
387 |         {
388 |             .sel = guest_ctx->seg_fs,
389 |             .vmcs_sel = VMCS_GUEST_FS_SEL,
390 |             .vmcs_lim = VMCS_GUEST_FS_LIMIT,
391 |             .vmcs_ar = VMCS_GUEST_FS_ACCESS_RIGHTS,
392 |             .vmcs_base = VMCS_GUEST_FS_BASE,
393 |             .gdtr = &vcpu->gdt_cfg.host_gdtr,
394 |         },
395 |         {
396 |             .sel = guest_ctx->seg_gs,
397 |             .vmcs_sel = VMCS_GUEST_GS_SEL,
398 |             .vmcs_lim = VMCS_GUEST_GS_LIMIT,
399 |             .vmcs_ar = VMCS_GUEST_GS_ACCESS_RIGHTS,
400 |             .vmcs_base = VMCS_GUEST_GS_BASE,
401 |             .gdtr = &vcpu->gdt_cfg.host_gdtr,
402 |         },
403 |         {
404 |             .sel = vcpu->gdt_cfg.host_tr.flags,
405 |             .vmcs_sel = VMCS_GUEST_TR_SEL,
406 |             .vmcs_lim = VMCS_GUEST_TR_LIMIT,
407 |             .vmcs_ar = VMCS_GUEST_TR_ACCESS_RIGHTS,
408 |             .vmcs_base = VMCS_GUEST_TR_BASE,
409 |             .gdtr = &vcpu->gdt_cfg.host_gdtr,
410 |         },
411 |         {
412 |             .sel = vcpu->gdt_cfg.guest_ldtr.flags,
413 |             .vmcs_sel = VMCS_GUEST_LDTR_SEL,
414 |             .vmcs_lim = VMCS_GUEST_LDTR_LIMIT,
415 |             .vmcs_ar = VMCS_GUEST_LDTR_ACCESS_RIGHTS,
416 |             .vmcs_base = VMCS_GUEST_LDTR_BASE,
417 |             .gdtr = &vcpu->gdt_cfg.host_gdtr,
418 |         }
419 |     };
420 | 
421 |     /*
422 |      * For TR and LDTR we cannot use what the guest has, as to be able to
423 |      * successfully VMENTER we need a TR and LDTR set (unfortunately)
424 |      */
425 | 
426 |     /* For each selector, generate it's entry and then fill in relevant fields. */
427 |     for (size_t i = 0; i < ARRAY_SIZE(vmcs_gdt_list); i++) {
428 |         const struct gdt_config *curr_cfg = &vmcs_gdt_list[i];
429 |         struct gdt_entry entry;
430 | 
431 |         gather_gdt_entry(curr_cfg->gdtr, curr_cfg->sel, &entry);
432 | 
433 |         /*
434 |          * This a shitty hack/workaround.
435 |          * Some BIOS/UEFI environments will temporarily go into protected
436 |          * mode on the AP's, if this is the case we need to ensure that
437 |          * any operations that use the stack won't cause a #SS. Really we
438 |          * should re-build the GDT entries upon re-entry to protected mode
439 |          * but we can potentially do that in the future if we really need
440 |          * to.
441 |          */
442 |         if (curr_cfg->vmcs_sel == VMCS_GUEST_SS_SEL)
443 |             entry.limit = UINT32_MAX;
444 | 
445 |         if (curr_cfg->vmcs_sel)
446 |             __vmwrite(curr_cfg->vmcs_sel, entry.sel);
447 |         if (curr_cfg->vmcs_lim)
448 |             __vmwrite(curr_cfg->vmcs_lim, entry.limit);
449 |         if (curr_cfg->vmcs_ar)
450 |             __vmwrite(curr_cfg->vmcs_ar, entry.access.flags);
451 |         if (curr_cfg->vmcs_base)
452 |             __vmwrite(curr_cfg->vmcs_base, entry.base);
453 | 
454 |         DEBUG_PRINT("VMX GDT Entry: %d\r\n" \
455 |                   "--- VMCS SEL [0x%lX]: 0x%lX\r\n" \
456 |                   "--- VMCS LIM [0x%lX]: 0x%lX\r\n" \
457 |                   "--- VMCS AR [0x%lX]: 0x%lX\r\n" \
458 |                   "--- VMCS BASE [0x%lX]: 0x%lX\r\n",
459 |                   i, 
460 |                   curr_cfg->vmcs_sel, entry.sel,
461 |                   curr_cfg->vmcs_lim, entry.limit,
462 |                   curr_cfg->vmcs_ar, entry.access.flags,
463 |                   curr_cfg->vmcs_base, entry.base);
464 |     }
465 | 
466 |     /* Now write the GDTR for the guest (due to TR & LDTR restrictions re-use guest). */
467 |     __vmwrite(VMCS_GUEST_GDTR_BASE, vcpu->gdt_cfg.guest_gdtr.base_address);
468 |     __vmwrite(VMCS_GUEST_GDTR_LIMIT, vcpu->gdt_cfg.guest_gdtr.limit);
469 | 
470 |     /* Now write IDTR, we can ACTUALLY use the guest IDT thank god... */
471 |     __vmwrite(VMCS_GUEST_IDTR_BASE, vmm->init.guest_idtr.base_address);
472 |     __vmwrite(VMCS_GUEST_IDTR_LIMIT, vmm->init.guest_idtr.limit);
473 | 
474 |     /* Control registers. */
475 |     __vmwrite(VMCS_CTRL_CR0_READ_SHADOW, vcpu->guest_ctrl_regs.reg_cr0.flags);
476 |     __vmwrite(VMCS_GUEST_CR0, vcpu->guest_ctrl_regs.reg_cr0.flags);
477 | 
478 |     __vmwrite(VMCS_CTRL_CR3_TARGET_COUNT, 0);
479 |     __vmwrite(VMCS_GUEST_CR3, vmm->init.guest_cr3.flags);
480 | 
481 |     /* 
482 |      * If a bit is set in the CR4 guest/host mask, this means that
483 |      * the value from the CR4 shadow will be utilised when in guest
484 |      * mode/non-root mode.
485 |      *
486 |      * As such here, we indicate the VMXE bit in CR4 is set to be intercepted
487 |      * and therefore we then indicate that VMXE is not indicated by clearing
488 |      * the VMXE bit in the CR4 read shadow.
489 |      */
490 | 
491 |     __vmwrite(VMCS_CTRL_CR4_MASK, CR4_VMXE_MASK);
492 |     __vmwrite(VMCS_CTRL_CR4_READ_SHADOW, vcpu->guest_ctrl_regs.reg_cr4.flags & ~CR4_VMXE_MASK);
493 |     __vmwrite(VMCS_GUEST_CR4, vcpu->guest_ctrl_regs.reg_cr4.flags);
494 | 
495 |     /* Debug kernel registers. */
496 |     __vmwrite(VMCS_GUEST_DEBUGCTL, vcpu->guest_ctrl_regs.debugctl.flags);
497 |     __vmwrite(VMCS_GUEST_DR7, vcpu->guest_ctrl_regs.dr7);
498 | 
499 |     /* Extended feature enable registers. */
500 |     __vmwrite(VMCS_GUEST_EFER, rdmsr(IA32_EFER));
501 | 
502 |     /*
503 |      * We (ab)use the GS_BASE field to store out vCPU context, we do this
504 |      * for guest context so that we can retrieve it in our hyperjack handler.
505 |      *
506 |      * This will eventually get overwritten/nulled when the guest OS boots
507 |      * after anyway.
508 |      */
509 |     __vmwrite(VMCS_GUEST_GS_BASE, (uintptr_t)vcpu);
510 | 
511 |     /*
512 |     * Finally, load the guest stack, instruction pointer, and rflags, which
513 |     * corresponds exactly to the location where __capture_context will return
514 |     * to inside of init_routine_per_vcpu.
515 |     *
516 |     * Use a dirty hack where we set the RSP to the kernel stack and the address
517 |     * then set the first parameter on the stack to point to the vCPU context
518 |     * so our host_to_guest shim can retrieve this. This MUST be accessible
519 |     * within the guest CR3 (we can't use our vmem from host).
520 |     * so we use the physical address as we should beidentity mapped.
521 |     */
522 |     cr3 this_cr3;
523 |     this_cr3.flags = __readcr3();
524 |     uintptr_t phys_vcpu_ctx = (uintptr_t)mem_va_to_pa(this_cr3, vcpu);
525 | 
526 |     uintptr_t guest_rip = (uintptr_t)vmm_hyperjack_handler;
527 |     uintptr_t guest_rsp = (uintptr_t)phys_vcpu_ctx + VMM_HYPERJACK_STACK_OFFSET;
528 | 
529 |     __vmwrite(VMCS_GUEST_RFLAGS, guest_ctx->e_flags);
530 |     __vmwrite(VMCS_GUEST_RSP, guest_rsp);
531 |     __vmwrite(VMCS_GUEST_RIP, guest_rip);
532 |     DEBUG_PRINT("VMCS_GUEST_RIP: 0x%lX VMCS_GUEST_RSP: 0x%lX PHYS_VCPU: 0x%lX",
533 |               guest_rip, guest_rsp, phys_vcpu_ctx);
534 | }
535 | 
536 | static void setup_vmcs_generic(struct vmm_ctx *vmm, struct vcpu_ctx *vcpu)
537 | {
538 |     /* Set up the link pointer. */
539 |     __vmwrite(VMCS_GUEST_VMCS_LINK_PTR, ~0ull);
540 | 
541 |     /* Set up the EPT fields. */
542 |     __vmwrite(VMCS_CTRL_EPTP, ept_get_pointer(vmm->ept)->flags);
543 |     __vmwrite(VMCS_CTRL_VPID, 1);
544 | 
545 |     /* Load the MSR bitmap with the bitmap which will be used to
546 |      * indicate which MSR reads/writes to trap on.
547 |      * Setting all bits indicates trap on read & write.
548 |      *
549 |      * NOTE: MSR trapping for EVERY read/write is very intensive
550 |      * trying to boot an actual OS with this is terrible. Instead
551 |      * in the future maybe we can use this to target MSR read/write
552 |      * when a specific CR3 is loaded/stored so we can do targetted
553 |      * reading of drivers etc. */
554 |     cr3 this_cr3;
555 |     this_cr3.flags = __readcr3();
556 | 
557 |     memset(vcpu->msr_trap_bitmap, 0x00, PAGE_SIZE);
558 |     __vmwrite(VMCS_CTRL_MSR_BITMAP, mem_va_to_pa(this_cr3, vcpu->msr_trap_bitmap));
559 | 
560 |     /* We don't explicitly enable any pin-based options ourselves, but there may
561 |      * be some required by the procesor, the encode the MSR to include these. */
562 |     uint32_t encoded = encode_msr(rdmsr(IA32_VMX_TRUE_PINBASED_CTLS), 0);
563 |     __vmwrite(VMCS_CTRL_PIN_EXEC, encoded);
564 | 
565 |     /*
566 |      * Enable support for RDTSCP and XSAVES/XRESTORES in the guest. Windows 10
567 |      * makes use of both of these instructions if the CPU supports it. By using
568 |      * adjustMSR, these options will be ignored if this processor does
569 |      * not actually support the instructions to begin with.
570 |      *
571 |      * Also enable EPT support, for additional performance and ability to trap
572 |      * memory access efficiently.
573 |      */
574 |     ia32_vmx_procbased_ctls2_register proc_ctls2 = { 0 };
575 |     proc_ctls2.enable_rdtscp = true;
576 |     proc_ctls2.enable_invpcid = true;
577 |     proc_ctls2.enable_xsaves = true;
578 |     proc_ctls2.unrestricted_guest = true;
579 |     proc_ctls2.enable_ept = true;
580 |     proc_ctls2.enable_vpid = true;
581 |     encoded = encode_msr(rdmsr(IA32_VMX_PROCBASED_CTLS2), proc_ctls2.flags);
582 |     __vmwrite(VMCS_CTRL_PROC_EXEC2, encoded);
583 | 
584 |     /* In order for the proc_ctls2 & MSR bitmap to be used we need to explicitly
585 |      * enable them. */
586 |     ia32_vmx_procbased_ctls_register proc_ctls = { 0 };
587 |     proc_ctls.use_msr_bitmaps = true;
588 |     proc_ctls.activate_secondary_controls = true;
589 |     encoded = encode_msr(rdmsr(IA32_VMX_TRUE_PROCBASED_CTLS), proc_ctls.flags);
590 |     __vmwrite(VMCS_CTRL_PROC_EXEC, encoded);
591 | 
592 |     /* Make sure to exit in x64 mode at all times. */
593 |     ia32_vmx_exit_ctls_register exit_ctls = { 0 };
594 |     exit_ctls.save_debug_controls = true;
595 |     exit_ctls.save_ia32_efer = true;
596 |     exit_ctls.host_address_space_size = true;
597 |     exit_ctls.load_ia32_efer = true;
598 |     encoded = encode_msr(rdmsr(IA32_VMX_TRUE_EXIT_CTLS), exit_ctls.flags);
599 |     __vmwrite(VMCS_CTRL_EXIT, encoded);
600 | 
601 |     /* Make sure when we re-enter it's back in x64 mode too. */
602 |     ia32_vmx_entry_ctls_register entry_ctls = { 0 };
603 |     entry_ctls.load_debug_controls = true;
604 |     entry_ctls.load_ia32_efer = true;
605 |     entry_ctls.ia32e_mode_guest = true;
606 |     encoded = encode_msr(rdmsr(IA32_VMX_TRUE_ENTRY_CTLS), entry_ctls.flags);
607 |     __vmwrite(VMCS_CTRL_ENTRY, encoded);
608 | }
609 | 
610 | static void hook_init_root_mode(struct vcpu_ctx *vcpu)
611 | {
612 |     /*
613 |      * A hook into where we're running as ROOT/HOST mode
614 |      * this can be used for initialising other application
615 |      * specific logic as required.
616 |      */
617 |     (void)vcpu;
618 | 
619 |     /* We CANNOT enable the memory hider at this point.
620 |      * Otherwise when we do a VMLAUNCH and hyperjack back
621 |      * into our driver we'll get a EPT violation.
622 |      *
623 |      * So, we should add a VMCALL routine to enable the
624 |      * hiding. This unfortunately will have to be triggered
625 |      * by a seperate module (plugin or UM process) as due
626 |      * to the reason mentioned above, we cannot hide our own
627 |      * memory when we need to execute from it in guest mode
628 |      * still. */
629 | }
630 | 
631 | static void __attribute__((ms_abi)) init_routine_per_vcpu(void *opaque)
632 | {
633 |     struct vmm_ctx *vmm = (struct vmm_ctx *)opaque;
634 | 
635 |     /* Ensure that the correct host CR3 is loaded for this vCPU.
636 |      * This SHOULD be the case for vCPU 0 as they were originally set during
637 |      * the initialisation of the modules, however for other vCPU's they
638 |      * will be set to what they were before the hyperjack. */
639 |     __writecr3(vmm->init.host_cr3.flags);
640 | 
641 |     size_t proc_idx;
642 |     die_on(!impl_get_processor_index(&proc_idx), "Unable to retrieve processor index.");
643 |     die_on(proc_idx >= VCPU_MAX, "vCPU index greater than supported by VMM.");
644 | 
645 |     /* Create the vCPU context structure.
646 |      * THIS MUST BE ALLOCATED AS CONTIGUOUS PHYSICAL MEMORY AS WHEN EXITING
647 |      * DURING THE HOST TO GUEST HYPERJACKING SHIM WE DON'T WANT NON-CONTIGUOUS
648 |      * PMEM WHICH WOULD CAUSE A POTENTIAL OVERWRITE OF WRONG PHYSICAL MEMORY. */
649 |     struct vcpu_ctx *vcpu = (struct vcpu_ctx *)pmem_alloc_contiguous(sizeof(struct vcpu_ctx));
650 |     die_on(!vcpu, "Unable to allocate vCPU %ld context.", proc_idx);
651 | 
652 |     /* Set the pointer so we can retrive VMM context from the vCPU context. */
653 |     vcpu->vmm = vmm;
654 |     vcpu->idx = proc_idx;
655 | 
656 |     DEBUG_PRINT("Initialising vCPU %ld vmm ctx 0x%lX vcpu ctx 0x%lX.", proc_idx, vmm, vcpu);
657 | 
658 |     /* Configure the host GDT. */
659 |     configure_vcpu_gdt(&vcpu->gdt_cfg);
660 | 
661 |     /* Capturing control registers & context for the vCPU,
662 |      * as to what the guest should be restored to once hyperjacked. */
663 |     capture_control_regs(&vcpu->guest_ctrl_regs);
664 |     __capture_context(&vcpu->hyperjack_context);
665 | 
666 |     /* First pass (before hypervised) this shall be false as we
667 |      * haven't hyperjacked yet. Upon restoration of the context
668 |      * from within the guest (which will lead us up to just after __capture_context)
669 |      * we need to do nothing and effectively "complete" loading of the driver. */
670 |     if (!vcpu->launched) {
671 |         enter_root_mode(vcpu);
672 | 
673 |         /* Set up VMCS */
674 |         setup_vmcs_generic(vmm, vcpu);
675 |         setup_vmcs_host(vmm, vcpu);
676 |         setup_vmcs_guest(vmm, vcpu);
677 | 
678 |         #ifdef CONFIG_NESTED
679 |             nested_init_vcpu(vcpu);
680 |         #endif
681 | 
682 |         /* Hook for when running as ROOT mode but we have not
683 |          * yet launched back into non-root/GUEST. */
684 |         hook_init_root_mode(vcpu);
685 | 
686 |         /* Attempt VMLAUNCH. */
687 |         DEBUG_PRINT("Attempting VMLAUNCH on vCPU %d with ctx: 0x%lX", proc_idx, vcpu);
688 |         __vmlaunch();
689 | 
690 |         /* 
691 |          * If we have got to this point, VMLAUNCH failed.
692 |          * Get failure reason and dump info for debugging. */
693 |         size_t fail_reason = __vmread(VMCS_VM_INSTR_ERROR);
694 |         debug_print("Failed to launch VMX with reason: 0x%lX", fail_reason);
695 |         while (1) {};
696 |     }
697 | }
698 | 
699 | static bool event_has_error_code(exception_vector vector)
700 | {
701 |     switch (vector) {
702 |         case double_fault:
703 |         case invalid_tss:
704 |         case segment_not_present:
705 |         case stack_segment_fault:
706 |         case general_protection:
707 |         case page_fault:
708 |         case alignment_check:
709 |             return true;
710 |         default:
711 |             return false;
712 |     }
713 | }
714 | 
715 | void vmm_init(struct vmm_init_params *params)
716 | {
717 |     /* Make sure the CPU supports all of the features required. */
718 |     probe_capabilities();
719 | 
720 |     /* Static allocation of the global vCPU context. This has to be
721 |      * static rather than dynamically allocated as the other vCPUs
722 |      * that are started will not have the host CR3 set, hence they
723 |      * will not have access to our dynamically allocated memory. */
724 |     static struct vmm_ctx vmm = { 0 };
725 |     memcpy(&vmm.init, params, sizeof(*params));
726 | 
727 |     spin_init(&vmm.lock);
728 |     vmm.ept = ept_init();
729 | 
730 |     handler_init(&vmm);
731 |     vmcall_init(&vmm);
732 | 
733 |     #ifdef CONFIG_NESTED
734 |         nested_init(&vmm);
735 |     #endif
736 | 
737 |     /* Run the initialisation routine on each LP. */
738 |     die_on(!impl_run_all_processors(init_routine_per_vcpu, &vmm),
739 |            "Unable to run VMM init routine on each LP.");
740 | }
741 | 
742 | void vmm_inject_guest_event(exception_vector vector, exception_error_code code)
743 | {
744 |     vmentry_interrupt_info info = { 0 };
745 |     interruption_type type;
746 | 
747 |     /* Determine if the vector has an error code associated with it. */
748 |     info.deliver_error_code = event_has_error_code(vector);
749 | 
750 |     /* Determine the interrupt type. */
751 |     switch (vector) {
752 |         case breakpoint:
753 |         case overflow:
754 |             type = software_exception;
755 |             break;
756 |         case debug:
757 |             type = privileged_software_exception;
758 |             break;
759 |         default:
760 |             type = hardware_exception;
761 |             break;
762 |     }
763 | 
764 |     /* Override if vector was greater than 0x20 */
765 |     if (vector >= 0x20) {
766 |         type = external_interrupt;
767 |     }
768 | 
769 |     info.vector = vector;
770 |     info.interruption_type = type;
771 |     info.valid = true;
772 |     __vmwrite(VMCS_CTRL_ENTRY_INTERRUPTION_INFO, info.flags);
773 | 
774 |     if (info.deliver_error_code)
775 |         __vmwrite(VMCS_CTRL_ENTRY_EXCEPTION_ERRCODE, code.flags);
776 | 
777 |     DEBUG_PRINT("Injected guest event 0x%lX type 0x%lX code 0x%lX", vector, type, code.flags);
778 | }


--------------------------------------------------------------------------------
/examples/uefi/mp_service.h:
--------------------------------------------------------------------------------
  1 | /** @file
  2 |   When installed, the MP Services Protocol produces a collection of services
  3 |   that are needed for MP management.
  4 | 
  5 |   The MP Services Protocol provides a generalized way of performing following tasks:
  6 |     - Retrieving information of multi-processor environment and MP-related status of
  7 |       specific processors.
  8 |     - Dispatching user-provided function to APs.
  9 |     - Maintain MP-related processor status.
 10 | 
 11 |   The MP Services Protocol must be produced on any system with more than one logical
 12 |   processor.
 13 | 
 14 |   The Protocol is available only during boot time.
 15 | 
 16 |   MP Services Protocol is hardware-independent. Most of the logic of this protocol
 17 |   is architecturally neutral. It abstracts the multi-processor environment and
 18 |   status of processors, and provides interfaces to retrieve information, maintain,
 19 |   and dispatch.
 20 | 
 21 |   MP Services Protocol may be consumed by ACPI module. The ACPI module may use this
 22 |   protocol to retrieve data that are needed for an MP platform and report them to OS.
 23 |   MP Services Protocol may also be used to program and configure processors, such
 24 |   as MTRR synchronization for memory space attributes setting in DXE Services.
 25 |   MP Services Protocol may be used by non-CPU DXE drivers to speed up platform boot
 26 |   by taking advantage of the processing capabilities of the APs, for example, using
 27 |   APs to help test system memory in parallel with other device initialization.
 28 |   Diagnostics applications may also use this protocol for multi-processor.
 29 | 
 30 | Copyright (c) 2006 - 2017, Intel Corporation. All rights reserved.<BR>
 31 | SPDX-License-Identifier: BSD-2-Clause-Patent
 32 | 
 33 |   @par Revision Reference:
 34 |   This Protocol is defined in the UEFI Platform Initialization Specification 1.2,
 35 |   Volume 2:Driver Execution Environment Core Interface.
 36 | 
 37 | **/
 38 | 
 39 | #ifndef _MP_SERVICE_PROTOCOL_H_
 40 | #define _MP_SERVICE_PROTOCOL_H_
 41 | 
 42 | #include "efi.h"
 43 | 
 44 | /* MODIFICATION: Added this manually from PI spec so I don't need to include other headers. */
 45 | typedef VOID (EFIAPI *EFI_AP_PROCEDURE) (IN VOID *ProcedureArgument);
 46 | 
 47 | ///
 48 | /// Global ID for the EFI_MP_SERVICES_PROTOCOL.
 49 | ///
 50 | #define EFI_MP_SERVICES_PROTOCOL_GUID \
 51 |   { \
 52 |     0x3fdda605, 0xa76e, 0x4f46, {0xad, 0x29, 0x12, 0xf4, 0x53, 0x1b, 0x3d, 0x08} \
 53 |   }
 54 | 
 55 | ///
 56 | /// Value used in the NumberProcessors parameter of the GetProcessorInfo function
 57 | ///
 58 | #define CPU_V2_EXTENDED_TOPOLOGY  BIT24
 59 | 
 60 | ///
 61 | /// Forward declaration for the EFI_MP_SERVICES_PROTOCOL.
 62 | ///
 63 | typedef struct _EFI_MP_SERVICES_PROTOCOL EFI_MP_SERVICES_PROTOCOL;
 64 | 
 65 | ///
 66 | /// Terminator for a list of failed CPUs returned by StartAllAPs().
 67 | ///
 68 | #define END_OF_CPU_LIST  0xffffffff
 69 | 
 70 | ///
 71 | /// This bit is used in the StatusFlag field of EFI_PROCESSOR_INFORMATION and
 72 | /// indicates whether the processor is playing the role of BSP. If the bit is 1,
 73 | /// then the processor is BSP. Otherwise, it is AP.
 74 | ///
 75 | #define PROCESSOR_AS_BSP_BIT  0x00000001
 76 | 
 77 | ///
 78 | /// This bit is used in the StatusFlag field of EFI_PROCESSOR_INFORMATION and
 79 | /// indicates whether the processor is enabled. If the bit is 1, then the
 80 | /// processor is enabled. Otherwise, it is disabled.
 81 | ///
 82 | #define PROCESSOR_ENABLED_BIT  0x00000002
 83 | 
 84 | ///
 85 | /// This bit is used in the StatusFlag field of EFI_PROCESSOR_INFORMATION and
 86 | /// indicates whether the processor is healthy. If the bit is 1, then the
 87 | /// processor is healthy. Otherwise, some fault has been detected for the processor.
 88 | ///
 89 | #define PROCESSOR_HEALTH_STATUS_BIT  0x00000004
 90 | 
 91 | ///
 92 | /// Structure that describes the pyhiscal location of a logical CPU.
 93 | ///
 94 | typedef struct {
 95 |   ///
 96 |   /// Zero-based physical package number that identifies the cartridge of the processor.
 97 |   ///
 98 |   UINT32    Package;
 99 |   ///
100 |   /// Zero-based physical core number within package of the processor.
101 |   ///
102 |   UINT32    Core;
103 |   ///
104 |   /// Zero-based logical thread number within core of the processor.
105 |   ///
106 |   UINT32    Thread;
107 | } EFI_CPU_PHYSICAL_LOCATION;
108 | 
109 | ///
110 | ///  Structure that defines the 6-level physical location of the processor
111 | ///
112 | typedef struct {
113 |   ///
114 |   ///    Package     Zero-based physical package number that identifies the cartridge of the processor.
115 |   ///
116 |   UINT32    Package;
117 |   ///
118 |   ///    Module      Zero-based physical module number within package of the processor.
119 |   ///
120 |   UINT32    Module;
121 |   ///
122 |   ///    Tile        Zero-based physical tile number within module of the processor.
123 |   ///
124 |   UINT32    Tile;
125 |   ///
126 |   ///    Die         Zero-based physical die number within tile of the processor.
127 |   ///
128 |   UINT32    Die;
129 |   ///
130 |   ///     Core        Zero-based physical core number within die of the processor.
131 |   ///
132 |   UINT32    Core;
133 |   ///
134 |   ///     Thread      Zero-based logical thread number within core of the processor.
135 |   ///
136 |   UINT32    Thread;
137 | } EFI_CPU_PHYSICAL_LOCATION2;
138 | 
139 | typedef union {
140 |   /// The 6-level physical location of the processor, including the
141 |   /// physical package number that identifies the cartridge, the physical
142 |   /// module number within package, the physical tile number within the module,
143 |   /// the physical die number within the tile, the physical core number within
144 |   /// package, and logical thread number within core.
145 |   EFI_CPU_PHYSICAL_LOCATION2    Location2;
146 | } EXTENDED_PROCESSOR_INFORMATION;
147 | 
148 | ///
149 | /// Structure that describes information about a logical CPU.
150 | ///
151 | typedef struct {
152 |   ///
153 |   /// The unique processor ID determined by system hardware.  For IA32 and X64,
154 |   /// the processor ID is the same as the Local APIC ID. Only the lower 8 bits
155 |   /// are used, and higher bits are reserved.  For IPF, the lower 16 bits contains
156 |   /// id/eid, and higher bits are reserved.
157 |   ///
158 |   UINT64    ProcessorId;
159 |   ///
160 |   /// Flags indicating if the processor is BSP or AP, if the processor is enabled
161 |   /// or disabled, and if the processor is healthy. Bits 3..31 are reserved and
162 |   /// must be 0.
163 |   ///
164 |   /// <pre>
165 |   /// BSP  ENABLED  HEALTH  Description
166 |   /// ===  =======  ======  ===================================================
167 |   ///  0      0       0     Unhealthy Disabled AP.
168 |   ///  0      0       1     Healthy Disabled AP.
169 |   ///  0      1       0     Unhealthy Enabled AP.
170 |   ///  0      1       1     Healthy Enabled AP.
171 |   ///  1      0       0     Invalid. The BSP can never be in the disabled state.
172 |   ///  1      0       1     Invalid. The BSP can never be in the disabled state.
173 |   ///  1      1       0     Unhealthy Enabled BSP.
174 |   ///  1      1       1     Healthy Enabled BSP.
175 |   /// </pre>
176 |   ///
177 |   UINT32                            StatusFlag;
178 |   ///
179 |   /// The physical location of the processor, including the physical package number
180 |   /// that identifies the cartridge, the physical core number within package, and
181 |   /// logical thread number within core.
182 |   ///
183 |   EFI_CPU_PHYSICAL_LOCATION         Location;
184 |   ///
185 |   /// The extended information of the processor. This field is filled only when
186 |   /// CPU_V2_EXTENDED_TOPOLOGY is set in parameter ProcessorNumber.
187 |   EXTENDED_PROCESSOR_INFORMATION    ExtendedInformation;
188 | } EFI_PROCESSOR_INFORMATION;
189 | 
190 | /**
191 |   This service retrieves the number of logical processor in the platform
192 |   and the number of those logical processors that are enabled on this boot.
193 |   This service may only be called from the BSP.
194 | 
195 |   This function is used to retrieve the following information:
196 |     - The number of logical processors that are present in the system.
197 |     - The number of enabled logical processors in the system at the instant
198 |       this call is made.
199 | 
200 |   Because MP Service Protocol provides services to enable and disable processors
201 |   dynamically, the number of enabled logical processors may vary during the
202 |   course of a boot session.
203 | 
204 |   If this service is called from an AP, then EFI_DEVICE_ERROR is returned.
205 |   If NumberOfProcessors or NumberOfEnabledProcessors is NULL, then
206 |   EFI_INVALID_PARAMETER is returned. Otherwise, the total number of processors
207 |   is returned in NumberOfProcessors, the number of currently enabled processor
208 |   is returned in NumberOfEnabledProcessors, and EFI_SUCCESS is returned.
209 | 
210 |   @param[in]  This                        A pointer to the EFI_MP_SERVICES_PROTOCOL
211 |                                           instance.
212 |   @param[out] NumberOfProcessors          Pointer to the total number of logical
213 |                                           processors in the system, including the BSP
214 |                                           and disabled APs.
215 |   @param[out] NumberOfEnabledProcessors   Pointer to the number of enabled logical
216 |                                           processors that exist in system, including
217 |                                           the BSP.
218 | 
219 |   @retval EFI_SUCCESS             The number of logical processors and enabled
220 |                                   logical processors was retrieved.
221 |   @retval EFI_DEVICE_ERROR        The calling processor is an AP.
222 |   @retval EFI_INVALID_PARAMETER   NumberOfProcessors is NULL.
223 |   @retval EFI_INVALID_PARAMETER   NumberOfEnabledProcessors is NULL.
224 | 
225 | **/
226 | typedef
227 | EFI_STATUS
228 | (EFIAPI *EFI_MP_SERVICES_GET_NUMBER_OF_PROCESSORS)(
229 |   IN  EFI_MP_SERVICES_PROTOCOL  *This,
230 |   OUT UINTN                     *NumberOfProcessors,
231 |   OUT UINTN                     *NumberOfEnabledProcessors
232 |   );
233 | 
234 | /**
235 |   Gets detailed MP-related information on the requested processor at the
236 |   instant this call is made. This service may only be called from the BSP.
237 | 
238 |   This service retrieves detailed MP-related information about any processor
239 |   on the platform. Note the following:
240 |     - The processor information may change during the course of a boot session.
241 |     - The information presented here is entirely MP related.
242 | 
243 |   Information regarding the number of caches and their sizes, frequency of operation,
244 |   slot numbers is all considered platform-related information and is not provided
245 |   by this service.
246 | 
247 |   @param[in]  This                  A pointer to the EFI_MP_SERVICES_PROTOCOL
248 |                                     instance.
249 |   @param[in]  ProcessorNumber       The handle number of processor.
250 |   @param[out] ProcessorInfoBuffer   A pointer to the buffer where information for
251 |                                     the requested processor is deposited.
252 | 
253 |   @retval EFI_SUCCESS             Processor information was returned.
254 |   @retval EFI_DEVICE_ERROR        The calling processor is an AP.
255 |   @retval EFI_INVALID_PARAMETER   ProcessorInfoBuffer is NULL.
256 |   @retval EFI_NOT_FOUND           The processor with the handle specified by
257 |                                   ProcessorNumber does not exist in the platform.
258 | 
259 | **/
260 | typedef
261 | EFI_STATUS
262 | (EFIAPI *EFI_MP_SERVICES_GET_PROCESSOR_INFO)(
263 |   IN  EFI_MP_SERVICES_PROTOCOL   *This,
264 |   IN  UINTN                      ProcessorNumber,
265 |   OUT EFI_PROCESSOR_INFORMATION  *ProcessorInfoBuffer
266 |   );
267 | 
268 | /**
269 |   This service executes a caller provided function on all enabled APs. APs can
270 |   run either simultaneously or one at a time in sequence. This service supports
271 |   both blocking and non-blocking requests. The non-blocking requests use EFI
272 |   events so the BSP can detect when the APs have finished. This service may only
273 |   be called from the BSP.
274 | 
275 |   This function is used to dispatch all the enabled APs to the function specified
276 |   by Procedure.  If any enabled AP is busy, then EFI_NOT_READY is returned
277 |   immediately and Procedure is not started on any AP.
278 | 
279 |   If SingleThread is TRUE, all the enabled APs execute the function specified by
280 |   Procedure one by one, in ascending order of processor handle number. Otherwise,
281 |   all the enabled APs execute the function specified by Procedure simultaneously.
282 | 
283 |   If WaitEvent is NULL, execution is in blocking mode. The BSP waits until all
284 |   APs finish or TimeoutInMicroSecs expires. Otherwise, execution is in non-blocking
285 |   mode, and the BSP returns from this service without waiting for APs. If a
286 |   non-blocking mode is requested after the UEFI Event EFI_EVENT_GROUP_READY_TO_BOOT
287 |   is signaled, then EFI_UNSUPPORTED must be returned.
288 | 
289 |   If the timeout specified by TimeoutInMicroseconds expires before all APs return
290 |   from Procedure, then Procedure on the failed APs is terminated. All enabled APs
291 |   are always available for further calls to EFI_MP_SERVICES_PROTOCOL.StartupAllAPs()
292 |   and EFI_MP_SERVICES_PROTOCOL.StartupThisAP(). If FailedCpuList is not NULL, its
293 |   content points to the list of processor handle numbers in which Procedure was
294 |   terminated.
295 | 
296 |   Note: It is the responsibility of the consumer of the EFI_MP_SERVICES_PROTOCOL.StartupAllAPs()
297 |   to make sure that the nature of the code that is executed on the BSP and the
298 |   dispatched APs is well controlled. The MP Services Protocol does not guarantee
299 |   that the Procedure function is MP-safe. Hence, the tasks that can be run in
300 |   parallel are limited to certain independent tasks and well-controlled exclusive
301 |   code. EFI services and protocols may not be called by APs unless otherwise
302 |   specified.
303 | 
304 |   In blocking execution mode, BSP waits until all APs finish or
305 |   TimeoutInMicroSeconds expires.
306 | 
307 |   In non-blocking execution mode, BSP is freed to return to the caller and then
308 |   proceed to the next task without having to wait for APs. The following
309 |   sequence needs to occur in a non-blocking execution mode:
310 | 
311 |     -# The caller that intends to use this MP Services Protocol in non-blocking
312 |        mode creates WaitEvent by calling the EFI CreateEvent() service.  The caller
313 |        invokes EFI_MP_SERVICES_PROTOCOL.StartupAllAPs(). If the parameter WaitEvent
314 |        is not NULL, then StartupAllAPs() executes in non-blocking mode. It requests
315 |        the function specified by Procedure to be started on all the enabled APs,
316 |        and releases the BSP to continue with other tasks.
317 |     -# The caller can use the CheckEvent() and WaitForEvent() services to check
318 |        the state of the WaitEvent created in step 1.
319 |     -# When the APs complete their task or TimeoutInMicroSecondss expires, the MP
320 |        Service signals WaitEvent by calling the EFI SignalEvent() function. If
321 |        FailedCpuList is not NULL, its content is available when WaitEvent is
322 |        signaled. If all APs returned from Procedure prior to the timeout, then
323 |        FailedCpuList is set to NULL. If not all APs return from Procedure before
324 |        the timeout, then FailedCpuList is filled in with the list of the failed
325 |        APs. The buffer is allocated by MP Service Protocol using AllocatePool().
326 |        It is the caller's responsibility to free the buffer with FreePool() service.
327 |     -# This invocation of SignalEvent() function informs the caller that invoked
328 |        EFI_MP_SERVICES_PROTOCOL.StartupAllAPs() that either all the APs completed
329 |        the specified task or a timeout occurred. The contents of FailedCpuList
330 |        can be examined to determine which APs did not complete the specified task
331 |        prior to the timeout.
332 | 
333 |   @param[in]  This                    A pointer to the EFI_MP_SERVICES_PROTOCOL
334 |                                       instance.
335 |   @param[in]  Procedure               A pointer to the function to be run on
336 |                                       enabled APs of the system. See type
337 |                                       EFI_AP_PROCEDURE.
338 |   @param[in]  SingleThread            If TRUE, then all the enabled APs execute
339 |                                       the function specified by Procedure one by
340 |                                       one, in ascending order of processor handle
341 |                                       number.  If FALSE, then all the enabled APs
342 |                                       execute the function specified by Procedure
343 |                                       simultaneously.
344 |   @param[in]  WaitEvent               The event created by the caller with CreateEvent()
345 |                                       service.  If it is NULL, then execute in
346 |                                       blocking mode. BSP waits until all APs finish
347 |                                       or TimeoutInMicroSeconds expires.  If it's
348 |                                       not NULL, then execute in non-blocking mode.
349 |                                       BSP requests the function specified by
350 |                                       Procedure to be started on all the enabled
351 |                                       APs, and go on executing immediately. If
352 |                                       all return from Procedure, or TimeoutInMicroSeconds
353 |                                       expires, this event is signaled. The BSP
354 |                                       can use the CheckEvent() or WaitForEvent()
355 |                                       services to check the state of event.  Type
356 |                                       EFI_EVENT is defined in CreateEvent() in
357 |                                       the Unified Extensible Firmware Interface
358 |                                       Specification.
359 |   @param[in]  TimeoutInMicrosecsond   Indicates the time limit in microseconds for
360 |                                       APs to return from Procedure, either for
361 |                                       blocking or non-blocking mode. Zero means
362 |                                       infinity.  If the timeout expires before
363 |                                       all APs return from Procedure, then Procedure
364 |                                       on the failed APs is terminated. All enabled
365 |                                       APs are available for next function assigned
366 |                                       by EFI_MP_SERVICES_PROTOCOL.StartupAllAPs()
367 |                                       or EFI_MP_SERVICES_PROTOCOL.StartupThisAP().
368 |                                       If the timeout expires in blocking mode,
369 |                                       BSP returns EFI_TIMEOUT.  If the timeout
370 |                                       expires in non-blocking mode, WaitEvent
371 |                                       is signaled with SignalEvent().
372 |   @param[in]  ProcedureArgument       The parameter passed into Procedure for
373 |                                       all APs.
374 |   @param[out] FailedCpuList           If NULL, this parameter is ignored. Otherwise,
375 |                                       if all APs finish successfully, then its
376 |                                       content is set to NULL. If not all APs
377 |                                       finish before timeout expires, then its
378 |                                       content is set to address of the buffer
379 |                                       holding handle numbers of the failed APs.
380 |                                       The buffer is allocated by MP Service Protocol,
381 |                                       and it's the caller's responsibility to
382 |                                       free the buffer with FreePool() service.
383 |                                       In blocking mode, it is ready for consumption
384 |                                       when the call returns. In non-blocking mode,
385 |                                       it is ready when WaitEvent is signaled.  The
386 |                                       list of failed CPU is terminated by
387 |                                       END_OF_CPU_LIST.
388 | 
389 |   @retval EFI_SUCCESS             In blocking mode, all APs have finished before
390 |                                   the timeout expired.
391 |   @retval EFI_SUCCESS             In non-blocking mode, function has been dispatched
392 |                                   to all enabled APs.
393 |   @retval EFI_UNSUPPORTED         A non-blocking mode request was made after the
394 |                                   UEFI event EFI_EVENT_GROUP_READY_TO_BOOT was
395 |                                   signaled.
396 |   @retval EFI_DEVICE_ERROR        Caller processor is AP.
397 |   @retval EFI_NOT_STARTED         No enabled APs exist in the system.
398 |   @retval EFI_NOT_READY           Any enabled APs are busy.
399 |   @retval EFI_TIMEOUT             In blocking mode, the timeout expired before
400 |                                   all enabled APs have finished.
401 |   @retval EFI_INVALID_PARAMETER   Procedure is NULL.
402 | 
403 | **/
404 | typedef
405 | EFI_STATUS
406 | (EFIAPI *EFI_MP_SERVICES_STARTUP_ALL_APS)(
407 |   IN  EFI_MP_SERVICES_PROTOCOL  *This,
408 |   IN  EFI_AP_PROCEDURE          Procedure,
409 |   IN  BOOLEAN                   SingleThread,
410 |   IN  EFI_EVENT                 WaitEvent               OPTIONAL,
411 |   IN  UINTN                     TimeoutInMicroSeconds,
412 |   IN  VOID                      *ProcedureArgument      OPTIONAL,
413 |   OUT UINTN                     **FailedCpuList         OPTIONAL
414 |   );
415 | 
416 | /**
417 |   This service lets the caller get one enabled AP to execute a caller-provided
418 |   function. The caller can request the BSP to either wait for the completion
419 |   of the AP or just proceed with the next task by using the EFI event mechanism.
420 |   See EFI_MP_SERVICES_PROTOCOL.StartupAllAPs() for more details on non-blocking
421 |   execution support.  This service may only be called from the BSP.
422 | 
423 |   This function is used to dispatch one enabled AP to the function specified by
424 |   Procedure passing in the argument specified by ProcedureArgument.  If WaitEvent
425 |   is NULL, execution is in blocking mode. The BSP waits until the AP finishes or
426 |   TimeoutInMicroSecondss expires. Otherwise, execution is in non-blocking mode.
427 |   BSP proceeds to the next task without waiting for the AP. If a non-blocking mode
428 |   is requested after the UEFI Event EFI_EVENT_GROUP_READY_TO_BOOT is signaled,
429 |   then EFI_UNSUPPORTED must be returned.
430 | 
431 |   If the timeout specified by TimeoutInMicroseconds expires before the AP returns
432 |   from Procedure, then execution of Procedure by the AP is terminated. The AP is
433 |   available for subsequent calls to EFI_MP_SERVICES_PROTOCOL.StartupAllAPs() and
434 |   EFI_MP_SERVICES_PROTOCOL.StartupThisAP().
435 | 
436 |   @param[in]  This                    A pointer to the EFI_MP_SERVICES_PROTOCOL
437 |                                       instance.
438 |   @param[in]  Procedure               A pointer to the function to be run on the
439 |                                       designated AP of the system. See type
440 |                                       EFI_AP_PROCEDURE.
441 |   @param[in]  ProcessorNumber         The handle number of the AP. The range is
442 |                                       from 0 to the total number of logical
443 |                                       processors minus 1. The total number of
444 |                                       logical processors can be retrieved by
445 |                                       EFI_MP_SERVICES_PROTOCOL.GetNumberOfProcessors().
446 |   @param[in]  WaitEvent               The event created by the caller with CreateEvent()
447 |                                       service.  If it is NULL, then execute in
448 |                                       blocking mode. BSP waits until this AP finish
449 |                                       or TimeoutInMicroSeconds expires.  If it's
450 |                                       not NULL, then execute in non-blocking mode.
451 |                                       BSP requests the function specified by
452 |                                       Procedure to be started on this AP,
453 |                                       and go on executing immediately. If this AP
454 |                                       return from Procedure or TimeoutInMicroSeconds
455 |                                       expires, this event is signaled. The BSP
456 |                                       can use the CheckEvent() or WaitForEvent()
457 |                                       services to check the state of event.  Type
458 |                                       EFI_EVENT is defined in CreateEvent() in
459 |                                       the Unified Extensible Firmware Interface
460 |                                       Specification.
461 |   @param[in]  TimeoutInMicrosecsond   Indicates the time limit in microseconds for
462 |                                       this AP to finish this Procedure, either for
463 |                                       blocking or non-blocking mode. Zero means
464 |                                       infinity.  If the timeout expires before
465 |                                       this AP returns from Procedure, then Procedure
466 |                                       on the AP is terminated. The
467 |                                       AP is available for next function assigned
468 |                                       by EFI_MP_SERVICES_PROTOCOL.StartupAllAPs()
469 |                                       or EFI_MP_SERVICES_PROTOCOL.StartupThisAP().
470 |                                       If the timeout expires in blocking mode,
471 |                                       BSP returns EFI_TIMEOUT.  If the timeout
472 |                                       expires in non-blocking mode, WaitEvent
473 |                                       is signaled with SignalEvent().
474 |   @param[in]  ProcedureArgument       The parameter passed into Procedure on the
475 |                                       specified AP.
476 |   @param[out] Finished                If NULL, this parameter is ignored.  In
477 |                                       blocking mode, this parameter is ignored.
478 |                                       In non-blocking mode, if AP returns from
479 |                                       Procedure before the timeout expires, its
480 |                                       content is set to TRUE. Otherwise, the
481 |                                       value is set to FALSE. The caller can
482 |                                       determine if the AP returned from Procedure
483 |                                       by evaluating this value.
484 | 
485 |   @retval EFI_SUCCESS             In blocking mode, specified AP finished before
486 |                                   the timeout expires.
487 |   @retval EFI_SUCCESS             In non-blocking mode, the function has been
488 |                                   dispatched to specified AP.
489 |   @retval EFI_UNSUPPORTED         A non-blocking mode request was made after the
490 |                                   UEFI event EFI_EVENT_GROUP_READY_TO_BOOT was
491 |                                   signaled.
492 |   @retval EFI_DEVICE_ERROR        The calling processor is an AP.
493 |   @retval EFI_TIMEOUT             In blocking mode, the timeout expired before
494 |                                   the specified AP has finished.
495 |   @retval EFI_NOT_READY           The specified AP is busy.
496 |   @retval EFI_NOT_FOUND           The processor with the handle specified by
497 |                                   ProcessorNumber does not exist.
498 |   @retval EFI_INVALID_PARAMETER   ProcessorNumber specifies the BSP or disabled AP.
499 |   @retval EFI_INVALID_PARAMETER   Procedure is NULL.
500 | 
501 | **/
502 | typedef
503 | EFI_STATUS
504 | (EFIAPI *EFI_MP_SERVICES_STARTUP_THIS_AP)(
505 |   IN  EFI_MP_SERVICES_PROTOCOL  *This,
506 |   IN  EFI_AP_PROCEDURE          Procedure,
507 |   IN  UINTN                     ProcessorNumber,
508 |   IN  EFI_EVENT                 WaitEvent               OPTIONAL,
509 |   IN  UINTN                     TimeoutInMicroseconds,
510 |   IN  VOID                      *ProcedureArgument      OPTIONAL,
511 |   OUT BOOLEAN                   *Finished               OPTIONAL
512 |   );
513 | 
514 | /**
515 |   This service switches the requested AP to be the BSP from that point onward.
516 |   This service changes the BSP for all purposes.   This call can only be performed
517 |   by the current BSP.
518 | 
519 |   This service switches the requested AP to be the BSP from that point onward.
520 |   This service changes the BSP for all purposes. The new BSP can take over the
521 |   execution of the old BSP and continue seamlessly from where the old one left
522 |   off. This service may not be supported after the UEFI Event EFI_EVENT_GROUP_READY_TO_BOOT
523 |   is signaled.
524 | 
525 |   If the BSP cannot be switched prior to the return from this service, then
526 |   EFI_UNSUPPORTED must be returned.
527 | 
528 |   @param[in] This              A pointer to the EFI_MP_SERVICES_PROTOCOL instance.
529 |   @param[in] ProcessorNumber   The handle number of AP that is to become the new
530 |                                BSP. The range is from 0 to the total number of
531 |                                logical processors minus 1. The total number of
532 |                                logical processors can be retrieved by
533 |                                EFI_MP_SERVICES_PROTOCOL.GetNumberOfProcessors().
534 |   @param[in] EnableOldBSP      If TRUE, then the old BSP will be listed as an
535 |                                enabled AP. Otherwise, it will be disabled.
536 | 
537 |   @retval EFI_SUCCESS             BSP successfully switched.
538 |   @retval EFI_UNSUPPORTED         Switching the BSP cannot be completed prior to
539 |                                   this service returning.
540 |   @retval EFI_UNSUPPORTED         Switching the BSP is not supported.
541 |   @retval EFI_DEVICE_ERROR        The calling processor is an AP.
542 |   @retval EFI_NOT_FOUND           The processor with the handle specified by
543 |                                   ProcessorNumber does not exist.
544 |   @retval EFI_INVALID_PARAMETER   ProcessorNumber specifies the current BSP or
545 |                                   a disabled AP.
546 |   @retval EFI_NOT_READY           The specified AP is busy.
547 | 
548 | **/
549 | typedef
550 | EFI_STATUS
551 | (EFIAPI *EFI_MP_SERVICES_SWITCH_BSP)(
552 |   IN EFI_MP_SERVICES_PROTOCOL  *This,
553 |   IN  UINTN                    ProcessorNumber,
554 |   IN  BOOLEAN                  EnableOldBSP
555 |   );
556 | 
557 | /**
558 |   This service lets the caller enable or disable an AP from this point onward.
559 |   This service may only be called from the BSP.
560 | 
561 |   This service allows the caller enable or disable an AP from this point onward.
562 |   The caller can optionally specify the health status of the AP by Health. If
563 |   an AP is being disabled, then the state of the disabled AP is implementation
564 |   dependent. If an AP is enabled, then the implementation must guarantee that a
565 |   complete initialization sequence is performed on the AP, so the AP is in a state
566 |   that is compatible with an MP operating system. This service may not be supported
567 |   after the UEFI Event EFI_EVENT_GROUP_READY_TO_BOOT is signaled.
568 | 
569 |   If the enable or disable AP operation cannot be completed prior to the return
570 |   from this service, then EFI_UNSUPPORTED must be returned.
571 | 
572 |   @param[in] This              A pointer to the EFI_MP_SERVICES_PROTOCOL instance.
573 |   @param[in] ProcessorNumber   The handle number of AP.
574 |                                The range is from 0 to the total number of
575 |                                logical processors minus 1. The total number of
576 |                                logical processors can be retrieved by
577 |                                EFI_MP_SERVICES_PROTOCOL.GetNumberOfProcessors().
578 |   @param[in] EnableAP          Specifies the new state for the processor for
579 |                                enabled, FALSE for disabled.
580 |   @param[in] HealthFlag        If not NULL, a pointer to a value that specifies
581 |                                the new health status of the AP. This flag
582 |                                corresponds to StatusFlag defined in
583 |                                EFI_MP_SERVICES_PROTOCOL.GetProcessorInfo(). Only
584 |                                the PROCESSOR_HEALTH_STATUS_BIT is used. All other
585 |                                bits are ignored.  If it is NULL, this parameter
586 |                                is ignored.
587 | 
588 |   @retval EFI_SUCCESS             The specified AP was enabled or disabled successfully.
589 |   @retval EFI_UNSUPPORTED         Enabling or disabling an AP cannot be completed
590 |                                   prior to this service returning.
591 |   @retval EFI_UNSUPPORTED         Enabling or disabling an AP is not supported.
592 |   @retval EFI_DEVICE_ERROR        The calling processor is an AP.
593 |   @retval EFI_NOT_FOUND           Processor with the handle specified by ProcessorNumber
594 |                                   does not exist.
595 |   @retval EFI_INVALID_PARAMETER   ProcessorNumber specifies the BSP.
596 | 
597 | **/
598 | typedef
599 | EFI_STATUS
600 | (EFIAPI *EFI_MP_SERVICES_ENABLEDISABLEAP)(
601 |   IN  EFI_MP_SERVICES_PROTOCOL  *This,
602 |   IN  UINTN                     ProcessorNumber,
603 |   IN  BOOLEAN                   EnableAP,
604 |   IN  UINT32                    *HealthFlag OPTIONAL
605 |   );
606 | 
607 | /**
608 |   This return the handle number for the calling processor.  This service may be
609 |   called from the BSP and APs.
610 | 
611 |   This service returns the processor handle number for the calling processor.
612 |   The returned value is in the range from 0 to the total number of logical
613 |   processors minus 1. The total number of logical processors can be retrieved
614 |   with EFI_MP_SERVICES_PROTOCOL.GetNumberOfProcessors(). This service may be
615 |   called from the BSP and APs. If ProcessorNumber is NULL, then EFI_INVALID_PARAMETER
616 |   is returned. Otherwise, the current processors handle number is returned in
617 |   ProcessorNumber, and EFI_SUCCESS is returned.
618 | 
619 |   @param[in] This              A pointer to the EFI_MP_SERVICES_PROTOCOL instance.
620 |   @param[in] ProcessorNumber   Pointer to the handle number of AP.
621 |                                The range is from 0 to the total number of
622 |                                logical processors minus 1. The total number of
623 |                                logical processors can be retrieved by
624 |                                EFI_MP_SERVICES_PROTOCOL.GetNumberOfProcessors().
625 | 
626 |   @retval EFI_SUCCESS             The current processor handle number was returned
627 |                                   in ProcessorNumber.
628 |   @retval EFI_INVALID_PARAMETER   ProcessorNumber is NULL.
629 | 
630 | **/
631 | typedef
632 | EFI_STATUS
633 | (EFIAPI *EFI_MP_SERVICES_WHOAMI)(
634 |   IN EFI_MP_SERVICES_PROTOCOL  *This,
635 |   OUT UINTN                    *ProcessorNumber
636 |   );
637 | 
638 | ///
639 | /// When installed, the MP Services Protocol produces a collection of services
640 | /// that are needed for MP management.
641 | ///
642 | /// Before the UEFI event EFI_EVENT_GROUP_READY_TO_BOOT is signaled, the module
643 | /// that produces this protocol is required to place all APs into an idle state
644 | /// whenever the APs are disabled or the APs are not executing code as requested
645 | /// through the StartupAllAPs() or StartupThisAP() services. The idle state of
646 | /// an AP before the UEFI event EFI_EVENT_GROUP_READY_TO_BOOT is signaled is
647 | /// implementation dependent.
648 | ///
649 | /// After the UEFI event EFI_EVENT_GROUP_READY_TO_BOOT is signaled, all the APs
650 | /// must be placed in the OS compatible CPU state as defined by the UEFI
651 | /// Specification. Implementations of this protocol may use the UEFI event
652 | /// EFI_EVENT_GROUP_READY_TO_BOOT to force APs into the OS compatible state as
653 | /// defined by the UEFI Specification. Modules that use this protocol must
654 | /// guarantee that all non-blocking mode requests on all APs have been completed
655 | /// before the UEFI event EFI_EVENT_GROUP_READY_TO_BOOT is signaled. Since the
656 | /// order that event notification functions in the same event group are executed
657 | /// is not deterministic, an event of type EFI_EVENT_GROUP_READY_TO_BOOT cannot
658 | /// be used to guarantee that APs have completed their non-blocking mode requests.
659 | ///
660 | /// When the UEFI event EFI_EVENT_GROUP_READY_TO_BOOT is signaled, the StartAllAPs()
661 | /// and StartupThisAp() services must no longer support non-blocking mode requests.
662 | /// The support for SwitchBSP() and EnableDisableAP() may no longer be supported
663 | /// after this event is signaled. Since UEFI Applications and UEFI OS Loaders
664 | /// execute after the UEFI event EFI_EVENT_GROUP_READY_TO_BOOT is signaled, these
665 | /// UEFI images must be aware that the functionality of this protocol may be reduced.
666 | ///
667 | struct _EFI_MP_SERVICES_PROTOCOL {
668 |   EFI_MP_SERVICES_GET_NUMBER_OF_PROCESSORS    GetNumberOfProcessors;
669 |   EFI_MP_SERVICES_GET_PROCESSOR_INFO          GetProcessorInfo;
670 |   EFI_MP_SERVICES_STARTUP_ALL_APS             StartupAllAPs;
671 |   EFI_MP_SERVICES_STARTUP_THIS_AP             StartupThisAP;
672 |   EFI_MP_SERVICES_SWITCH_BSP                  SwitchBSP;
673 |   EFI_MP_SERVICES_ENABLEDISABLEAP             EnableDisableAP;
674 |   EFI_MP_SERVICES_WHOAMI                      WhoAmI;
675 | };
676 | 
677 | extern EFI_GUID  gEfiMpServiceProtocolGuid;
678 | 
679 | #endif
680 | 


--------------------------------------------------------------------------------