├── .gitignore ├── logging.c ├── tests ├── .gitignore ├── test_utils.h ├── meson.build └── test_libvhost.py ├── subprojects └── libblkio.wrap ├── meson_options.txt ├── .exrc ├── platform.c ├── ya.make ├── virtio ├── virtio_types.h ├── virtio_fs.h ├── virtio_fs_spec.h ├── virtio_spec.h ├── virtio_blk.h ├── virt_queue.h ├── virtio_fs.c ├── virtio_blk_spec.h ├── virtio_blk.c └── virt_queue.c ├── memlog.h ├── LICENSE ├── bio.h ├── README.md ├── memmap.h ├── server_internal.h ├── CMakeLists.txt ├── logging.h ├── meson.build ├── queue.h ├── include └── vhost │ ├── fs.h │ ├── types.h │ ├── server.h │ └── blockdev.h ├── .github └── workflows │ └── main.yaml ├── objref.h ├── memlog.c ├── catomic.h ├── fs.c ├── event.h ├── blockdev.c ├── vhost_spec.h ├── platform.h ├── vdev.h ├── docs ├── logo.svg └── architecture.md ├── server.c ├── memmap.c └── event.c /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | subprojects/libblkio/ 3 | -------------------------------------------------------------------------------- /logging.c: -------------------------------------------------------------------------------- 1 | #include "logging.h" 2 | 3 | log_function g_log_fn; 4 | -------------------------------------------------------------------------------- /tests/.gitignore: -------------------------------------------------------------------------------- 1 | work/ 2 | libblkio/ 3 | .pytest_cache/ 4 | __pycache__/ -------------------------------------------------------------------------------- /subprojects/libblkio.wrap: -------------------------------------------------------------------------------- 1 | [wrap-git] 2 | url = https://gitlab.com/libblkio/libblkio.git/ 3 | revision = f1eabd1b 4 | -------------------------------------------------------------------------------- /meson_options.txt: -------------------------------------------------------------------------------- 1 | option('libblkio', type : 'feature', value : 'auto', description : 'Pull libblkio subproject (required for tests)') 2 | -------------------------------------------------------------------------------- /.exrc: -------------------------------------------------------------------------------- 1 | "VIM settings to match QEMU coding style. 
They are activated by adding the 2 | "following settings (without the " symbol) as last two lines in $HOME/.vimrc: 3 | "set secure 4 | "set exrc 5 | set expandtab 6 | set shiftwidth=4 7 | set smarttab 8 | -------------------------------------------------------------------------------- /platform.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "platform.h" 5 | 6 | int init_platform_page_size(void) 7 | { 8 | if (!platform_page_size) { 9 | long result = sysconf(_SC_PAGESIZE); 10 | if (result < 0) { 11 | return errno; 12 | } 13 | platform_page_size = result; 14 | } 15 | 16 | return 0; 17 | } 18 | -------------------------------------------------------------------------------- /ya.make: -------------------------------------------------------------------------------- 1 | LIBRARY(vhost-server) 2 | 3 | CFLAGS( 4 | -Wno-unused-parameter 5 | ) 6 | 7 | SRCS( 8 | blockdev.c 9 | event.c 10 | fs.c 11 | logging.c 12 | memlog.c 13 | memmap.c 14 | server.c 15 | vdev.c 16 | platform.c 17 | virtio/virt_queue.c 18 | virtio/virtio_blk.c 19 | virtio/virtio_fs.c 20 | ) 21 | 22 | ADDINCL( 23 | GLOBAL cloud/contrib/vhost/include 24 | cloud/contrib/vhost 25 | ) 26 | 27 | END() 28 | 29 | -------------------------------------------------------------------------------- /virtio/virtio_types.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Definitions from virtio spec version 1.0 3 | * http://docs.oasis-open.org/virtio/virtio/v1.0/virtio-v1.0.html. 4 | * 5 | * Type naming and style is preserved verbatim from virtio spec. 6 | */ 7 | 8 | #pragma once 9 | 10 | #ifdef __cplusplus 11 | extern "C" { 12 | #endif 13 | 14 | typedef uint8_t u8; 15 | 16 | #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ 17 | typedef uint16_t le16; 18 | typedef uint32_t le32; 19 | typedef uint64_t le64; 20 | #else 21 | # error Implement me 22 | #endif 23 | 24 | #ifdef __cplusplus 25 | } 26 | #endif 27 | -------------------------------------------------------------------------------- /memlog.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "memmap.h" 6 | 7 | #ifdef __cplusplus 8 | extern "C" { 9 | #endif 10 | 11 | struct vhd_memory_log; 12 | 13 | struct vhd_memory_log *vhd_memlog_new(size_t size, int fd, off_t offset); 14 | void vhd_memlog_free(struct vhd_memory_log *log); 15 | 16 | void vhd_mark_range_dirty(struct vhd_memory_log *log, 17 | struct vhd_memory_map *mm, void *ptr, size_t len); 18 | void vhd_mark_gpa_range_dirty(struct vhd_memory_log *log, uint64_t gpa, 19 | size_t len); 20 | 21 | #ifdef __cplusplus 22 | } 23 | #endif 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2024 YANDEX LLC 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | -------------------------------------------------------------------------------- /bio.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Internally used representation of a block io request passed to and returned 3 | * from the block backend. 4 | */ 5 | 6 | #pragma once 7 | 8 | #include "queue.h" 9 | #include "vhost/server.h" 10 | 11 | #ifdef __cplusplus 12 | extern "C" { 13 | #endif 14 | 15 | struct vhd_vring; 16 | 17 | struct vhd_io { 18 | enum vhd_bdev_io_result status; 19 | struct vhd_vring *vring; 20 | 21 | void (*completion_handler)(struct vhd_io *io); 22 | 23 | TAILQ_ENTRY(vhd_io) submission_link; 24 | TAILQ_ENTRY(vhd_io) inflight_link; 25 | SLIST_ENTRY(vhd_io) completion_link; 26 | 27 | time_t ts; 28 | }; 29 | 30 | #ifdef __cplusplus 31 | } 32 | #endif 33 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | <h1>Libvhost</h1>
3 |
4 |
5 | 6 | [![CI](https://github.com/yandex-cloud/yc-libvhost-server/actions/workflows/main.yaml/badge.svg)](https://github.com/yandex-cloud/yc-libvhost-server/actions/workflows/main.yaml) 7 | 8 | A library for building [vhost-user protocol](https://qemu-project.gitlab.io/qemu/interop/vhost-user.html) servers. 9 | 10 | ## Quickstart 11 | 12 | Building the project: 13 | ```bash 14 | CC=clang meson setup build 15 | ninja -C build 16 | ``` 17 | 18 | Running tests locally: 19 | ``` 20 | ninja test -C build 21 | ``` 22 | -------------------------------------------------------------------------------- /virtio/virtio_fs.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "virtio_fs_spec.h" 4 | 5 | #ifdef __cplusplus 6 | extern "C" { 7 | #endif 8 | 9 | struct virtio_fs_dev; 10 | struct virtio_virtq; 11 | 12 | struct vhd_fsdev_info; 13 | struct vhd_bio; 14 | struct vhd_guest_memory_map; 15 | 16 | #define VIRTIO_FS_DEFAULT_FEATURES ((uint64_t)( \ 17 | (1UL << VIRTIO_F_RING_INDIRECT_DESC) | \ 18 | (1UL << VIRTIO_F_VERSION_1))) 19 | 20 | /** 21 | * Virtio file system device context 22 | */ 23 | struct virtio_fs_dev { 24 | struct vhd_fsdev_info *fsdev; 25 | 26 | /* fs config data generated on init from fsdev */ 27 | struct virtio_fs_config config; 28 | }; 29 | 30 | /** 31 | * Init virtio fs device context from fsdev info 32 | */ 33 | int virtio_fs_init_dev( 34 | struct virtio_fs_dev *dev, 35 | struct vhd_fsdev_info *fsdev); 36 | 37 | /** 38 | * Dispatch requests from device virtq 39 | */ 40 | int virtio_fs_dispatch_requests(struct virtio_fs_dev *dev, 41 | struct virtio_virtq *vq); 42 | 43 | #ifdef __cplusplus 44 | } 45 | #endif 46 | -------------------------------------------------------------------------------- /tests/test_utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "vhost/server.h" 8 | 9 | /* Normally we pass LOG_VERBOSITY from make */ 10 | #ifndef LOG_VERBOSITY 11 | # define LOG_VERBOSITY LOG_INFO 12 | #endif 13 | 14 | /* Log function for tests */ 15 | static const char *const log_level_str[] = { 16 | "ERROR", 17 | "WARNING", 18 | "INFO", 19 | "DEBUG", 20 | }; 21 | 22 | __attribute__((format(printf, 2, 3))) 23 | static inline void vhd_log_stderr(enum LogLevel level, const char *fmt, ...) 
24 | { 25 | va_list args; 26 | va_start(args, fmt); 27 | if (level <= LOG_VERBOSITY) { 28 | char timestr[64]; 29 | struct timeval tv; 30 | 31 | gettimeofday(&tv, NULL); 32 | strftime(timestr, sizeof(timestr), "%F %T", localtime(&tv.tv_sec)); 33 | fprintf(stderr, "%s.%03ld [%8s] ", timestr, tv.tv_usec / 1000, 34 | log_level_str[level]); 35 | vfprintf(stderr, fmt, args); 36 | fprintf(stderr, "\n"); 37 | } 38 | va_end(args); 39 | } 40 | -------------------------------------------------------------------------------- /memmap.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #ifdef __cplusplus 9 | extern "C" { 10 | #endif 11 | 12 | struct vhd_memory_map; 13 | 14 | struct vhd_memory_map *vhd_memmap_new(int (*map_cb)(void *, size_t), 15 | int (*unmap_cb)(void *, size_t)); 16 | struct vhd_memory_map *vhd_memmap_dup(struct vhd_memory_map *mm); 17 | struct vhd_memory_map *vhd_memmap_dup_remap(struct vhd_memory_map *mm); 18 | 19 | size_t vhd_memmap_max_memslots(void); 20 | 21 | int vhd_memmap_add_slot(struct vhd_memory_map *mm, uint64_t gpa, uint64_t uva, 22 | size_t size, int fd, off_t offset, bool preserve_fd); 23 | int vhd_memmap_del_slot(struct vhd_memory_map *mm, uint64_t gpa, uint64_t uva, 24 | size_t size); 25 | 26 | void vhd_memmap_ref(struct vhd_memory_map *mm); 27 | void vhd_memmap_unref(struct vhd_memory_map *mm); 28 | 29 | void *gpa_range_to_ptr(struct vhd_memory_map *mm, uint64_t gpa, size_t len); 30 | void *uva_to_ptr(struct vhd_memory_map *mm, uint64_t uva); 31 | #define TRANSLATION_FAILED ((uint64_t)-1) 32 | uint64_t ptr_to_gpa(struct vhd_memory_map *mm, void *ptr); 33 | 34 | #ifdef __cplusplus 35 | } 36 | #endif 37 | -------------------------------------------------------------------------------- /virtio/virtio_fs_spec.h: -------------------------------------------------------------------------------- 1 | /* 2 | * virtio-fs protocol definitions 3 | */ 4 | 5 | #pragma once 6 | 7 | #include "virtio_spec.h" 8 | 9 | #ifdef __cplusplus 10 | extern "C" { 11 | #endif 12 | 13 | /* 14 | * Device configuration layout. 15 | */ 16 | struct VHD_PACKED virtio_fs_config { 17 | /* Filesystem name (UTF-8, not NUL-terminated, padded with NULs) */ 18 | u8 tag[36]; 19 | 20 | /* Number of request queues exposed by the device. */ 21 | le32 num_request_queues; 22 | }; 23 | 24 | /* 25 | * Generic FUSE request in/out headers. 26 | * FIXME: these are duplicates of fuse_in_header/fuse_out_header, and should be 27 | * removed in favor of the latter. 28 | */ 29 | struct virtio_fs_in_header { 30 | le32 len; 31 | le32 opcode; 32 | le64 unique; 33 | le64 nodeid; 34 | le32 uid; 35 | le32 gid; 36 | le32 pid; 37 | le32 padding; 38 | }; 39 | 40 | struct virtio_fs_out_header { 41 | le32 len; 42 | le32 error; 43 | le64 unique; 44 | }; 45 | 46 | /* 47 | * Device operation request. 
48 | * 49 | * Request is a variable sized structure: 50 | * struct virtio_fs_req { 51 | * // Device-readable part 52 | * struct virtio_fs_in_header in; 53 | * u8 datain[]; 54 | * 55 | * // Device-writable part 56 | * struct virtio_fs_out_header out; 57 | * u8 dataout[]; 58 | * }; 59 | */ 60 | 61 | #ifdef __cplusplus 62 | } 63 | #endif 64 | -------------------------------------------------------------------------------- /server_internal.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "vhost/server.h" 4 | 5 | struct vhd_io_handler; 6 | /* Add io handler to vhost control event loop */ 7 | struct vhd_io_handler *vhd_add_vhost_io_handler(int fd, int (*read)(void *), 8 | void *opaque); 9 | 10 | struct vhd_request_queue; 11 | /* Add io handler to request queue event loop */ 12 | struct vhd_io_handler *vhd_add_rq_io_handler(struct vhd_request_queue *rq, 13 | int fd, int (*read)(void *), 14 | void *opaque); 15 | 16 | struct vhd_vdev; 17 | struct vhd_io; 18 | struct vhd_vring; 19 | 20 | /** 21 | * Enqueue IO request 22 | */ 23 | int vhd_enqueue_request(struct vhd_request_queue *rq, 24 | struct vhd_io *io); 25 | 26 | void vhd_cancel_queued_requests(struct vhd_request_queue *rq, 27 | const struct vhd_vring *vring); 28 | 29 | /** 30 | * Run callback in request queue 31 | */ 32 | void vhd_run_in_rq(struct vhd_request_queue *rq, void (*cb)(void *), 33 | void *opaque); 34 | 35 | /* 36 | * Run callback in vhost control event loop 37 | */ 38 | void vhd_run_in_ctl(void (*cb)(void *), void *opaque); 39 | 40 | /* 41 | * Submit a work item onto vhost control event loop and wait till it's 42 | * finished. 43 | */ 44 | struct vhd_work; 45 | int vhd_submit_ctl_work_and_wait(void (*func)(struct vhd_work *, void *), 46 | void *opaque); 47 | 48 | bool vhd_in_ctl_thread(void); 49 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.15) 2 | 3 | project(yc-libvhost-server C) 4 | 5 | if(UNIX AND NOT APPLE) 6 | set(LINUX TRUE) 7 | endif() 8 | 9 | if(NOT LINUX) 10 | message(FATAL_ERROR "Unsupported platform") 11 | endif() 12 | 13 | set(LIBVHOST_LOG_VERBOSITY "LOG_INFO" CACHE STRING "Libvhost log verbosity") 14 | message("Compiler ${CMAKE_C_COMPILER}") 15 | message("Libvhost log verbosity: ${LIBVHOST_LOG_VERBOSITY}") 16 | 17 | add_library(vhost-server) 18 | add_compile_definitions(_GNU_SOURCE LOG_VERBOSITY=${LIBVHOST_LOG_VERBOSITY}) 19 | target_compile_options(vhost-server PRIVATE 20 | -Wall 21 | -Werror 22 | -Wextra 23 | -Wno-unused-parameter 24 | -g 25 | -O2 26 | 27 | # make these warnings non-fatal in gcc 28 | $<$: 29 | -Wno-error=unused-value 30 | -Wno-error=unused-result 31 | -Wno-error=strict-aliasing 32 | > 33 | 34 | # enable additional warnings to enforce coding standards 35 | -Wmissing-prototypes 36 | -Wmissing-declarations 37 | $<$: 38 | -Wmissing-variable-declarations 39 | -Wzero-length-array 40 | > 41 | $<$: 42 | -Wzero-length-bounds 43 | > 44 | ) 45 | target_include_directories(vhost-server PUBLIC 46 | include 47 | ) 48 | target_include_directories(vhost-server PRIVATE 49 | ./ 50 | ) 51 | target_sources(vhost-server PRIVATE 52 | blockdev.c 53 | event.c 54 | fs.c 55 | logging.c 56 | memlog.c 57 | memmap.c 58 | server.c 59 | vdev.c 60 | platform.c 61 | virtio/virt_queue.c 62 | virtio/virtio_blk.c 63 | virtio/virtio_fs.c 64 | ) 65 | 
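
The CMakeLists.txt above (like meson.build and ya.make elsewhere in this repository) builds the same static `vhost-server` library. As a rough orientation before the test harness that follows, here is a minimal consumer sketch driving the public API declared in `include/vhost/server.h` (shown further below). The `main()` scaffolding, the `log_stderr` helper, and completing every request with `VHD_BDEV_SUCCESS` are illustrative assumptions, not library code.

```c
/*
 * Hypothetical consumer sketch: links against the vhost-server static
 * library and drives the public API from include/vhost/server.h.
 * A real application would also register a device (e.g. with
 * vhd_register_fs() from include/vhost/fs.h) and do real request processing.
 */
#include <errno.h>
#include <stdarg.h>
#include <stdio.h>

#include "vhost/server.h"

static void log_stderr(enum LogLevel level, const char *fmt, ...)
{
    va_list args;
    (void)level;            /* verbosity filtering elided in this sketch */
    va_start(args, fmt);
    vfprintf(stderr, fmt, args);
    fprintf(stderr, "\n");
    va_end(args);
}

int main(void)
{
    /* Spawn the control thread that accepts vhost-user connections */
    if (vhd_start_vhost_server(log_stderr) < 0) {
        return 1;
    }

    /* A request queue is the unit of load balancing between devices */
    struct vhd_request_queue *rq = vhd_create_request_queue();

    /*
     * A device would be registered here and attached to rq so that its
     * I/O requests land in this queue.
     */

    /* Run the queue until vhd_stop_queue() is called from another thread */
    while (vhd_run_queue(rq) == -EAGAIN) {
        struct vhd_request req;
        while (vhd_dequeue_request(rq, &req)) {
            /* ... process req.io here, then signal completion ... */
            vhd_complete_bio(req.io, VHD_BDEV_SUCCESS);
        }
    }

    vhd_release_request_queue(rq);
    vhd_stop_vhost_server();
    return 0;
}
```
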
-------------------------------------------------------------------------------- /tests/meson.build: -------------------------------------------------------------------------------- 1 | libaio = cc.find_library('aio', required: true) 2 | libpthread = cc.find_library('pthread', required: true) 3 | 4 | vhost_user_blk_test_server_includes = include_directories( 5 | '../' 6 | ) 7 | 8 | vhost_user_blk_test_server = executable( 9 | 'vhost-user-blk-test-server', 10 | 'vhost_user_blk_test_server.c', 11 | link_with: libvhost, 12 | dependencies: [libaio, libpthread], 13 | include_directories: [ 14 | vhost_user_blk_test_server_includes, 15 | libvhost_includes 16 | ] 17 | ) 18 | 19 | # If libblkio is disabled, we have no client to run 20 | # against vhost-user-blk-test-server, so nothing to do here. 21 | if not libblkio_proj.found() 22 | subdir_done() 23 | endif 24 | 25 | # If libblkio subproject doesn't define blkio_bench, this yields 26 | # a fatal error. It is OK as we pull a specific libblkio revision 27 | # which is known to define blkio_bench, and if it is missing, 28 | # something is certainly wrong. 29 | libblkio_bench_dep = libblkio_proj.get_variable('blkio_bench') 30 | 31 | envdata = environment() 32 | envdata.append( 33 | 'TEST_SERVER_BINARY', 34 | vhost_user_blk_test_server.full_path() 35 | ) 36 | envdata.append( 37 | 'BLKIO_BENCH_BINARY', 38 | libblkio_bench_dep.full_path() 39 | ) 40 | 41 | test( 42 | 'unit-tests', 43 | import('python').find_installation('python3', modules: ['pytest']), 44 | args: ['-m', 'pytest', '-rsv'], 45 | depends: [vhost_user_blk_test_server, libblkio_bench_dep], 46 | env: envdata, 47 | workdir: meson.current_source_dir(), 48 | timeout: 150, 49 | is_parallel: false, 50 | ) 51 | -------------------------------------------------------------------------------- /logging.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "platform.h" 4 | 5 | #include "vhost/server.h" 6 | 7 | extern log_function __attribute__((format(printf, 2, 3))) g_log_fn; 8 | 9 | #define VHD_LOG(level, fmt, ...) \ 10 | do { \ 11 | if (g_log_fn) { \ 12 | g_log_fn(level, "%s:%d: " fmt, \ 13 | __func__, __LINE__, ##__VA_ARGS__); \ 14 | } \ 15 | } while (0) 16 | 17 | #ifndef VHD_NO_DEBUG_LOGS 18 | # define VHD_LOG_DEBUG(fmt, ...) VHD_LOG(LOG_DEBUG, fmt, ##__VA_ARGS__) 19 | #else 20 | # define VHD_LOG_DEBUG(fmt, ...) 21 | #endif 22 | 23 | #define VHD_LOG_INFO(fmt, ...) VHD_LOG(LOG_INFO, fmt, ##__VA_ARGS__) 24 | #define VHD_LOG_WARN(fmt, ...) VHD_LOG(LOG_WARNING, fmt, ##__VA_ARGS__) 25 | #define VHD_LOG_ERROR(fmt, ...) VHD_LOG(LOG_ERROR, fmt, ##__VA_ARGS__) 26 | 27 | /* 28 | * Generic helpers to produce log messages tagged by an object. For that, the 29 | * object must provide duck-typed interface of ->log_tag field of type "const 30 | * char *". 31 | */ 32 | #define VHD_OBJ_DEBUG(obj, fmt, ...) \ 33 | VHD_LOG_DEBUG("%s: " fmt, obj->log_tag, ##__VA_ARGS__) 34 | #define VHD_OBJ_INFO(obj, fmt, ...) \ 35 | VHD_LOG_INFO("%s: " fmt, obj->log_tag, ##__VA_ARGS__) 36 | #define VHD_OBJ_WARN(obj, fmt, ...) \ 37 | VHD_LOG_WARN("%s: " fmt, obj->log_tag, ##__VA_ARGS__) 38 | #define VHD_OBJ_ERROR(obj, fmt, ...) 
\ 39 | VHD_LOG_ERROR("%s: " fmt, obj->log_tag, ##__VA_ARGS__) 40 | 41 | -------------------------------------------------------------------------------- /meson.build: -------------------------------------------------------------------------------- 1 | project( 2 | 'yc-libvhost-server', 'c' 3 | ) 4 | 5 | libvhost_log_verbosity = 'LOG_INFO' 6 | 7 | libvhost_includes = include_directories( 8 | 'include' 9 | ) 10 | 11 | libvhost_sources = files([ 12 | 'blockdev.c', 13 | 'event.c', 14 | 'fs.c', 15 | 'logging.c', 16 | 'memlog.c', 17 | 'memmap.c', 18 | 'server.c', 19 | 'vdev.c', 20 | 'platform.c', 21 | 'virtio/virtio_blk.c', 22 | 'virtio/virtio_fs.c', 23 | 'virtio/virt_queue.c' 24 | ]) 25 | 26 | libvhost_args = [ 27 | '-Wall', 28 | '-Werror', 29 | '-Wextra', 30 | '-Wno-unused-parameter', 31 | '-g', 32 | '-O2', 33 | ] 34 | 35 | cc = meson.get_compiler('c') 36 | 37 | if cc.get_id() == 'gcc' 38 | libvhost_args += [ 39 | '-Wno-error=unused-value', 40 | '-Wno-error=unused-result', 41 | '-Wno-error=strict-aliasing', 42 | ] 43 | endif 44 | 45 | libvhost_optional_args = cc.get_supported_arguments( 46 | '-Wmissing-prototypes', 47 | '-Wmissing-variable-declarations', 48 | '-Wzero-length-array', 49 | '-Wzero-length-bounds', 50 | ) 51 | 52 | libvhost_defines = [ 53 | '-D_GNU_SOURCE', 54 | '-DLOG_VERBOSITY=' + libvhost_log_verbosity 55 | ] 56 | 57 | libvhost = static_library( 58 | 'vhost', 59 | sources: libvhost_sources, 60 | include_directories: libvhost_includes, 61 | c_args: libvhost_args + libvhost_optional_args + libvhost_defines, 62 | ) 63 | 64 | libblkio_proj = subproject( 65 | 'libblkio', 66 | default_options: [ 67 | 'subproject-docs=disabled', 68 | 'subproject-examples=enabled', # used in libvhost tests 69 | 'subproject-tests=disabled' 70 | ], 71 | required: get_option('libblkio') 72 | ) 73 | subdir('tests') 74 | -------------------------------------------------------------------------------- /queue.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Lists and queues. 3 | * 4 | * Relevant OSes provide BSD-originated sys/queue.h, so just use it here, with 5 | * a few extensions. 
6 | */ 7 | 8 | #pragma once 9 | 10 | #include 11 | #include "catomic.h" 12 | 13 | /* 14 | * Atomically insert a new list head 15 | */ 16 | #define SLIST_INSERT_HEAD_ATOMIC(head, elm, field) ({ \ 17 | typeof(elm) old_slh_first; \ 18 | do { \ 19 | /* Grab the current head and make the new element point to it */ \ 20 | (elm)->field.sle_next = catomic_read(&(head)->slh_first); \ 21 | old_slh_first = (elm)->field.sle_next; \ 22 | \ 23 | /* Repeat until slh_first matches old_slh_first at the time of cmpxchg */ \ 24 | } while (catomic_cmpxchg(&(head)->slh_first, old_slh_first, (elm)) != \ 25 | old_slh_first); \ 26 | old_slh_first; }) 27 | 28 | /* 29 | * Atomically move the list into 'dest' leaving 'src' empty 30 | */ 31 | #define SLIST_MOVE_ATOMIC(dest, src) do { \ 32 | (dest)->slh_first = catomic_xchg(&(src)->slh_first, NULL); \ 33 | } while (0) 34 | 35 | /* 36 | * Read the current list head with consume 37 | */ 38 | #define SLIST_FIRST_RCU(head) catomic_rcu_read(&(head)->slh_first) 39 | 40 | #define LIST_FOREACH_SAFE(elm, head, field, tmp_elm) \ 41 | for ((elm) = ((head)->lh_first); \ 42 | (elm) && ((tmp_elm) = LIST_NEXT((elm), field), 1); \ 43 | (elm) = (tmp_elm)) 44 | -------------------------------------------------------------------------------- /include/vhost/fs.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "vhost/types.h" 4 | 5 | #ifdef __cplusplus 6 | extern "C" { 7 | #endif 8 | 9 | struct vhd_io; 10 | struct vhd_request_queue; 11 | struct vhd_vdev; 12 | 13 | /** 14 | * Client-supplied file system definition. 15 | */ 16 | struct vhd_fsdev_info { 17 | /* Path to create listen sockets */ 18 | const char *socket_path; 19 | 20 | /* Device tag (file system name visible to the guest) */ 21 | const char *tag; 22 | 23 | /* Total number of backend queues this device supports */ 24 | uint32_t num_queues; 25 | }; 26 | 27 | /** 28 | * In-flight file system io request 29 | */ 30 | struct vhd_fs_io { 31 | struct vhd_sglist sglist; 32 | }; 33 | 34 | struct vhd_fs_io *vhd_get_fs_io(struct vhd_io *io); 35 | 36 | /** 37 | * Register vhost file system. 38 | * 39 | * After registering device will be accessible through vhost socket to client. 40 | * All requests are submitted to attacher request queue for caller to process. 41 | * 42 | * @fsdev Caller file system device info. 43 | * @rq Request queue to use for dispatch device I/O requests. 44 | * @priv Caller private data to associate with resulting vdev. 45 | */ 46 | struct vhd_vdev *vhd_register_fs(struct vhd_fsdev_info *fsdev, 47 | struct vhd_request_queue *rq, 48 | void *priv); 49 | 50 | struct vhd_vdev *vhd_register_fs_mq(struct vhd_fsdev_info *fsdev, 51 | struct vhd_request_queue **rqs, 52 | int num_rqs, 53 | void *priv); 54 | 55 | /** 56 | * Unregister vhost file system. 
57 | */ 58 | void vhd_unregister_fs(struct vhd_vdev *vdev, 59 | void (*unregister_complete)(void *), 60 | void *arg); 61 | 62 | #ifdef __cplusplus 63 | } 64 | #endif 65 | -------------------------------------------------------------------------------- /include/vhost/types.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Common types' definitions 3 | */ 4 | 5 | #pragma once 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #ifdef __cplusplus 13 | extern "C" { 14 | #endif 15 | 16 | struct vhd_buffer { 17 | void *base; 18 | size_t len; 19 | 20 | /* Buffer is write-only if true and read-only if false */ 21 | bool write_only; 22 | }; 23 | 24 | struct vhd_sglist { 25 | uint32_t nbuffers; 26 | struct vhd_buffer *buffers; 27 | }; 28 | 29 | /** 30 | * virtqueue usage statistics 31 | */ 32 | struct vhd_vq_metrics { 33 | /* Dispatch counters */ 34 | /* number of times vring was processed */ 35 | uint64_t dispatch_total; 36 | /* number of times vring was empty on processing */ 37 | uint64_t dispatch_empty; 38 | 39 | /* Request counters */ 40 | /* total amount of requests processed */ 41 | uint64_t request_total; 42 | /* total amount of requests completed */ 43 | uint64_t request_completed; 44 | 45 | /* Other counters*/ 46 | /* number of requests was dispatched from vring last time*/ 47 | uint16_t queue_len_last; 48 | /* max queue len was processed during 60s period */ 49 | uint16_t queue_len_max_60s; 50 | }; 51 | 52 | /** 53 | * request queue usage statistics 54 | */ 55 | struct vhd_rq_metrics { 56 | /* number of requests read from guest and put to internal queue */ 57 | uint64_t enqueued; 58 | /* number of requests dispatched for handling */ 59 | uint64_t dequeued; 60 | /* number of requests completed externally and scheduled for completion in rq */ 61 | uint64_t completions_received; 62 | /* number of requests completed and reported to guest */ 63 | uint64_t completed; 64 | /* number of requests canceled from internal queue before dispatch */ 65 | uint64_t cancelled; 66 | 67 | /* timestamp of oldest infight request */ 68 | time_t oldest_inflight_ts; 69 | }; 70 | 71 | #ifdef __cplusplus 72 | } 73 | #endif 74 | -------------------------------------------------------------------------------- /.github/workflows/main.yaml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: [push, pull_request] 3 | 4 | jobs: 5 | build-and-run-tests: 6 | runs-on: ubuntu-latest 7 | 8 | steps: 9 | - name: Checkout the repository 10 | uses: actions/checkout@v3 11 | 12 | - name: Install libraries 13 | run: | 14 | sudo apt update 15 | sudo apt install python3 python3-docutils meson clang libaio-dev rustc cargo 16 | python3 -m pip install pytest 17 | rustup default 1.89.0 18 | rustup component add clippy 19 | 20 | - name: Build 21 | run: | 22 | src_dir="${{ github.workspace }}" 23 | build_dir="$src_dir/build" 24 | CC=clang meson setup $build_dir $src_dir 25 | pushd $build_dir 26 | ninja 27 | popd 28 | 29 | - name: Run Tests 30 | run: | 31 | python3 -m pytest ${{ github.workspace }}/tests/test_libvhost.py -rsv --junitxml result.xml 32 | 33 | - name: Collect test results 34 | uses: mikepenz/action-junit-report@v3 35 | if: always() 36 | with: 37 | report_paths: result.xml 38 | build-with-cmake: 39 | runs-on: ubuntu-latest 40 | 41 | steps: 42 | - name: Checkout the repository 43 | uses: actions/checkout@v3 44 | 45 | - name: Install libraries 46 | run: | 47 | sudo apt update 48 | sudo apt install cmake ninja-build 
clang 49 | 50 | - name: Build 51 | run: | 52 | src_dir="${{ github.workspace }}" 53 | build_dir="$src_dir/build" 54 | cmake -S $src_dir -B $build_dir -G Ninja -DCMAKE_C_COMPILER=clang 55 | ninja -C $build_dir 56 | lint-python-scripts: 57 | runs-on: ubuntu-latest 58 | 59 | strategy: 60 | fail-fast: true 61 | 62 | steps: 63 | - uses: actions/checkout@v3 64 | 65 | - name: Install flake8 & mypy 66 | run: | 67 | sudo apt update 68 | sudo apt install python3 python3-pip 69 | pip install flake8 mypy pytest 70 | 71 | - name: Run flake8 72 | run: flake8 tests/*.py 73 | 74 | - name: Run mypy 75 | run: mypy --disallow-incomplete-defs --no-implicit-optional tests/*.py 76 | -------------------------------------------------------------------------------- /objref.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Generic reference counting infrastructure. 3 | * 4 | * Note: refcounts need more relaxed memory ordering that regular atomics. 5 | * 6 | * The increments provide no ordering, because it's expected that the object is 7 | * held by something else that provides ordering. 8 | * 9 | * The decrements provide release order, such that all the prior loads and 10 | * stores will be issued before, it also provides a control dependency, which 11 | * will order against the subsequent free(). 12 | * 13 | * The control dependency is against the load of the cmpxchg (ll/sc) that 14 | * succeeded. This means the stores aren't fully ordered, but this is fine 15 | * because the 1->0 transition indicates no concurrency. 16 | * 17 | * The decrements dec_and_test() and sub_and_test() also provide acquire 18 | * ordering on success. 19 | */ 20 | 21 | #pragma once 22 | 23 | #include 24 | #include "catomic.h" 25 | #include "platform.h" 26 | 27 | struct objref { 28 | unsigned long refcount; 29 | void (*release)(struct objref *objref); 30 | }; 31 | 32 | static inline void objref_init(struct objref *objref, 33 | void (*release)(struct objref *objref)) 34 | { 35 | objref->release = release; 36 | catomic_set(&objref->refcount, 1); 37 | } 38 | 39 | static inline unsigned int objref_read(struct objref *objref) 40 | { 41 | return catomic_read(&objref->refcount); 42 | } 43 | 44 | static inline void refcount_inc(unsigned long *ptr) 45 | { 46 | __atomic_fetch_add(ptr, 1, __ATOMIC_RELAXED); 47 | } 48 | 49 | static inline void objref_get(struct objref *objref) 50 | { 51 | refcount_inc(&objref->refcount); 52 | } 53 | 54 | static inline bool refcount_dec_and_test(unsigned long *ptr) 55 | { 56 | const int memory_order = 57 | #if VHD_HAS_FEATURE(thread_sanitizer) 58 | __ATOMIC_ACQ_REL; 59 | #else 60 | __ATOMIC_RELEASE; 61 | #endif 62 | unsigned long old = __atomic_fetch_sub(ptr, 1, memory_order); 63 | 64 | if (old == 1) { 65 | smp_mb_acquire(); 66 | return true; 67 | } 68 | return false; 69 | } 70 | 71 | /* 72 | * Decrement refcount for object, and call @release if it drops to zero. 73 | * Return true if the object was removed, otherwise return false. 74 | * Note: only "true" is trustworthy, "false" doesn't prevent another thread 75 | * from releasing the object. 
76 | */ 77 | static inline bool objref_put(struct objref *objref) 78 | { 79 | if (refcount_dec_and_test(&objref->refcount)) { 80 | objref->release(objref); 81 | return true; 82 | } 83 | return false; 84 | } 85 | -------------------------------------------------------------------------------- /virtio/virtio_spec.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Definitions from virtio spec version 1.0 3 | * http://docs.oasis-open.org/virtio/virtio/v1.0/virtio-v1.0.html. 4 | * 5 | * Type naming and style is preserved verbatim from virtio spec. 6 | */ 7 | 8 | #pragma once 9 | 10 | #include "platform.h" 11 | #include "virtio_types.h" 12 | 13 | #ifdef __cplusplus 14 | extern "C" { 15 | #endif 16 | 17 | #define VIRTQ_SIZE_MAX 32768u 18 | 19 | struct virtq_desc { 20 | /* Address (guest-physical). */ 21 | le64 addr; 22 | /* Length. */ 23 | le32 len; 24 | 25 | /* This marks a buffer as continuing via the next field. */ 26 | #define VIRTQ_DESC_F_NEXT 1 27 | /* This marks a buffer as device write-only (otherwise device read-only). */ 28 | #define VIRTQ_DESC_F_WRITE 2 29 | /* This means the buffer contains a list of buffer descriptors. */ 30 | #define VIRTQ_DESC_F_INDIRECT 4 31 | /* The flags as indicated above. */ 32 | le16 flags; 33 | /* Next field if flags & NEXT */ 34 | le16 next; 35 | }; 36 | VHD_STATIC_ASSERT(sizeof(struct virtq_desc) == 16); 37 | 38 | struct virtq_avail { 39 | #define VIRTQ_AVAIL_F_NO_INTERRUPT 1 40 | le16 flags; 41 | le16 idx; 42 | le16 ring[]; /* Queue Size */ 43 | /* le16 used_event; Only if VIRTIO_F_EVENT_IDX */ 44 | }; 45 | VHD_STATIC_ASSERT(sizeof(struct virtq_avail) == 4); 46 | 47 | /* le32 is used here for ids for padding reasons. */ 48 | struct virtq_used_elem { 49 | /* Index of start of used descriptor chain. */ 50 | le32 id; 51 | /* 52 | * The number of bytes written into the device writable portion of 53 | * the buffer described by the descriptor chain. 54 | */ 55 | le32 len; 56 | }; 57 | VHD_STATIC_ASSERT(sizeof(struct virtq_used_elem) == 8); 58 | 59 | struct virtq_used { 60 | #define VIRTQ_USED_F_NO_NOTIFY 1 61 | le16 flags; 62 | le16 idx; 63 | struct virtq_used_elem ring[]; /* Queue Size */ 64 | /* le16 avail_event; Only if VIRTIO_F_EVENT_IDX */ 65 | }; 66 | VHD_STATIC_ASSERT(sizeof(struct virtq_used) == 4); 67 | 68 | /* 69 | * Virtqueue layout cannot be represented by a C struct, 70 | * definition below is intentionally a comment. 71 | * struct virtq { 72 | * // The actual descriptors (16 bytes each) 73 | * struct virtq_desc desc[ Queue Size ]; 74 | * 75 | * // A ring of available descriptor heads with free-running index. 76 | * struct virtq_avail avail; 77 | * le16 used_event; // Only if VIRTIO_F_EVENT_IDX 78 | * 79 | * // Padding to the next PAGE_SIZE boundary. 80 | * u8 pad[ Padding ]; 81 | * 82 | * // A ring of used descriptor heads with free-running index. 
83 | * struct virtq_used used; 84 | * le16 avail_event; // Only if VIRTIO_F_EVENT_IDX 85 | * }; 86 | */ 87 | 88 | static inline size_t virtq_align(size_t size) 89 | { 90 | return (size + platform_page_size) & ~platform_page_size; 91 | } 92 | 93 | static inline unsigned virtq_size(unsigned int qsz) 94 | { 95 | return virtq_align(sizeof(struct virtq_desc) * qsz + 96 | sizeof(le16) * (3 + qsz)) 97 | + virtq_align(sizeof(le16) * 3 + 98 | sizeof(struct virtq_used_elem) * qsz); 99 | } 100 | 101 | #ifdef __cplusplus 102 | } 103 | #endif 104 | -------------------------------------------------------------------------------- /memlog.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "catomic.h" 5 | #include "logging.h" 6 | #include "memlog.h" 7 | #include "memmap.h" 8 | 9 | struct vhd_memory_log { 10 | unsigned long *base; 11 | size_t size; 12 | }; 13 | 14 | struct vhd_memory_log *vhd_memlog_new(size_t size, int fd, off_t offset) 15 | { 16 | struct vhd_memory_log *log; 17 | void *base; 18 | 19 | base = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, offset); 20 | if (base == MAP_FAILED) { 21 | VHD_LOG_ERROR("mmap(%zu, %d, %zu): %s", size, fd, offset, 22 | strerror(errno)); 23 | return NULL; 24 | } 25 | 26 | log = vhd_alloc(sizeof(*log)); 27 | *log = (struct vhd_memory_log) { 28 | .base = base, 29 | .size = size, 30 | }; 31 | return log; 32 | } 33 | 34 | void vhd_memlog_free(struct vhd_memory_log *log) 35 | { 36 | munmap(log->base, log->size); 37 | vhd_free(log); 38 | } 39 | 40 | static void atomic_or_le_ulong(unsigned long *ptr, unsigned long mask) 41 | { 42 | VHD_STATIC_ASSERT(sizeof(*ptr) == sizeof(uint64_t)); 43 | catomic_or(ptr, htole64(mask)); 44 | } 45 | 46 | static void bitmap_set_atomic(unsigned long *map, size_t start, size_t end) 47 | { 48 | static const unsigned bits_per_word = sizeof(*map) * 8; 49 | size_t start_idx = start / bits_per_word; 50 | size_t end_idx = end / bits_per_word; 51 | size_t i; 52 | unsigned start_in_word = start % bits_per_word; 53 | unsigned end_in_word = end % bits_per_word; 54 | 55 | /* first partial word */ 56 | if (start_in_word && start_idx < end_idx) { 57 | atomic_or_le_ulong(&map[start_idx], ~0UL << start_in_word); 58 | start_in_word = 0; 59 | start_idx++; 60 | } 61 | 62 | /* full words: no RMW so relaxed atomic; no endianness */ 63 | for (i = start_idx; i < end_idx; i++) { 64 | catomic_set(&map[i], ~0UL); 65 | } 66 | 67 | /* last partial word */ 68 | if (end_in_word) { 69 | unsigned nr_clear_bits = bits_per_word - (end_in_word - start_in_word); 70 | atomic_or_le_ulong(&map[end_idx], 71 | (~0UL >> nr_clear_bits) << start_in_word); 72 | } else if (start_idx < end_idx) { 73 | /* 74 | * if there were any relaxed catomic_set's not followed by an implicit 75 | * full memory barrier in catomic_or, do an explicit one 76 | */ 77 | smp_mb(); 78 | } 79 | } 80 | 81 | #define VHOST_LOG_PAGE 0x1000 82 | 83 | void vhd_mark_gpa_range_dirty(struct vhd_memory_log *log, uint64_t gpa, 84 | size_t len) 85 | { 86 | size_t start = gpa / VHOST_LOG_PAGE; 87 | size_t end = (gpa + len - 1) / VHOST_LOG_PAGE + 1; 88 | 89 | /* this is internal function, overflown ranges shouldn't reach here */ 90 | VHD_ASSERT(gpa + len > gpa); 91 | 92 | if (end > log->size * 8) { 93 | VHD_LOG_ERROR("range 0x%zx-0x%zx beyond log size %zx", gpa, 94 | gpa + len - 1, log->size); 95 | end = log->size * 8; 96 | } 97 | 98 | bitmap_set_atomic(log->base, start, end); 99 | } 100 | 101 | void vhd_mark_range_dirty(struct 
vhd_memory_log *log, 102 | struct vhd_memory_map *mm, void *ptr, size_t len) 103 | { 104 | uint64_t gpa = ptr_to_gpa(mm, ptr); 105 | if (gpa != TRANSLATION_FAILED) { 106 | vhd_mark_gpa_range_dirty(log, gpa, len); 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /virtio/virtio_blk.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "virtio_blk_spec.h" 4 | 5 | #ifdef __cplusplus 6 | extern "C" { 7 | #endif 8 | 9 | #define VIRTIO_BLK_DEFAULT_FEATURES ((uint64_t)( \ 10 | (1UL << VIRTIO_F_RING_INDIRECT_DESC) | \ 11 | (1UL << VIRTIO_F_RING_EVENT_IDX) | \ 12 | (1UL << VIRTIO_F_VERSION_1) | \ 13 | (1UL << VIRTIO_BLK_F_SEG_MAX) | \ 14 | (1UL << VIRTIO_BLK_F_GEOMETRY) | \ 15 | (1UL << VIRTIO_BLK_F_BLK_SIZE) | \ 16 | (1UL << VIRTIO_BLK_F_TOPOLOGY) | \ 17 | (1UL << VIRTIO_BLK_F_MQ))) 18 | 19 | /* 20 | * TODO: can implement size_max and seg_max to better control request limits 21 | * (1UL << VIRTIO_BLK_F_SIZE_MAX) | \ 22 | */ 23 | 24 | /* 25 | * Same as QEMU: 26 | * We support only one segment per request since multiple segments 27 | * are not widely used and there are no userspace APIs that allow 28 | * applications to submit multiple segments in a single call. 29 | */ 30 | #define VIRTIO_BLK_MAX_DISCARD_SEGMENTS 1 31 | #define VIRTIO_BLK_MAX_WRITE_ZEROES_SEGMENTS 1 32 | 33 | /* 34 | * The config field is an 'le32', we just set it to the maximum 35 | * possible value as we don't really have any reasons to limit 36 | * it to a lower number here. 37 | */ 38 | #define VIRTIO_BLK_MAX_DISCARD_SECTORS UINT32_MAX 39 | #define VIRTIO_BLK_MAX_WRITE_ZEROES_SECTORS UINT32_MAX 40 | 41 | struct vhd_bdev_info; 42 | struct vhd_io; 43 | 44 | struct virtio_virtq; 45 | struct virtio_blk_dev; 46 | 47 | /** 48 | * Virtio block I/O dispatch function, 49 | * can be overriden for testing. 50 | */ 51 | __attribute__((weak)) 52 | int virtio_blk_handle_request(struct virtio_virtq *vq, 53 | struct vhd_io *io); 54 | 55 | /** 56 | * Virtio block device context 57 | */ 58 | struct virtio_blk_dev { 59 | char *serial; 60 | uint64_t features; 61 | 62 | /* blk config data generated on init from bdev */ 63 | struct virtio_blk_config config; 64 | }; 65 | 66 | /** 67 | * Init virtio blk device context from bdev info 68 | */ 69 | void virtio_blk_init_dev( 70 | struct virtio_blk_dev *dev, 71 | const struct vhd_bdev_info *bdev); 72 | 73 | /** 74 | * Destroy virtio blk device context 75 | */ 76 | void virtio_blk_destroy_dev(struct virtio_blk_dev *dev); 77 | 78 | /** 79 | * Dispatch requests from device virtq 80 | */ 81 | int virtio_blk_dispatch_requests(struct virtio_blk_dev *dev, 82 | struct virtio_virtq *vq); 83 | 84 | /** 85 | * Get the virtio config 86 | */ 87 | size_t virtio_blk_get_config(struct virtio_blk_dev *dev, void *cfgbuf, 88 | size_t bufsize, size_t offset); 89 | 90 | /** 91 | * Get all supported virtio features 92 | */ 93 | uint64_t virtio_blk_get_features(struct virtio_blk_dev *dev); 94 | 95 | /** 96 | * Check if @dev supports a given virtio feature. 97 | * @feature is the bit index, and not the mask 98 | */ 99 | bool virtio_blk_has_feature(struct virtio_blk_dev *dev, int feature); 100 | 101 | /** 102 | * Get readonly status 103 | */ 104 | bool virtio_blk_is_readonly(struct virtio_blk_dev *dev); 105 | 106 | /** 107 | * Get total_blocks 108 | */ 109 | uint64_t virtio_blk_get_total_blocks(struct virtio_blk_dev *dev); 110 | 111 | /** 112 | * Update virtio config for new @total_blocks. 
113 | */ 114 | void virtio_blk_set_total_blocks(struct virtio_blk_dev *dev, 115 | uint64_t total_blocks); 116 | 117 | #ifdef __cplusplus 118 | } 119 | #endif 120 | -------------------------------------------------------------------------------- /catomic.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | /* 4 | * This is a compiler barrier, it doesn't prevent the CPU from reordering loads 5 | * and stores in any way, but prevents compiler optimizations such as 6 | * reordering code around. Mostly used internally by this header to make other 7 | * helpers fully atomic. 8 | */ 9 | #define barrier() __atomic_signal_fence(__ATOMIC_ACQ_REL) 10 | 11 | /* 12 | * Reportedly __atomic_thread_fence does not include a compiler barrier, so add 13 | * one here. 14 | */ 15 | #define smp_mb() \ 16 | ({ barrier(); __atomic_thread_fence(__ATOMIC_SEQ_CST); }) 17 | #define smp_mb_release() \ 18 | ({ barrier(); __atomic_thread_fence(__ATOMIC_RELEASE); }) 19 | #define smp_mb_acquire() \ 20 | ({ barrier(); __atomic_thread_fence(__ATOMIC_ACQUIRE); }) 21 | 22 | /* 23 | * Reportedly current compilers promote consume order to acquire and 24 | * slow this down unnecessarily. This seems not to be the case on x86_64; need 25 | * to recheck if we ever build for another arch. 26 | */ 27 | #if !defined(__x86_64__) && !defined(__aarch64__) 28 | #error Verify smp_read_barrier_depends incurs no extra costs 29 | #endif 30 | #define smp_read_barrier_depends() \ 31 | ({ barrier(); __atomic_thread_fence(__ATOMIC_CONSUME); }) 32 | 33 | #define smp_wmb() smp_mb_release() 34 | #define smp_rmb() smp_mb_acquire() 35 | 36 | #define catomic_read(ptr) __atomic_load_n(ptr, __ATOMIC_RELAXED) 37 | #define catomic_set(ptr, val) __atomic_store_n(ptr, val, __ATOMIC_RELAXED) 38 | 39 | #define catomic_load_acquire(ptr) \ 40 | __atomic_load_n(ptr, __ATOMIC_ACQUIRE) 41 | #define catomic_store_release(ptr, val) \ 42 | __atomic_store_n(ptr, val, __ATOMIC_RELEASE) 43 | 44 | /* 45 | * catomic_rcu_read potentially has the same issue with consume order as 46 | * smp_read_barrier_depends, see above. 
47 | */ 48 | #if !defined(__x86_64__) && !defined(__aarch64__) 49 | #error Verify catomic_rcu_read incurs no extra costs 50 | #endif 51 | #define catomic_rcu_read(ptr) __atomic_load_n(ptr, __ATOMIC_CONSUME) 52 | #define catomic_rcu_set(ptr, val) __atomic_store_n(ptr, val, __ATOMIC_RELEASE) 53 | 54 | #define catomic_xchg(ptr, val) \ 55 | __atomic_exchange_n(ptr, val, __ATOMIC_SEQ_CST) 56 | #define catomic_cmpxchg(ptr, old, new) ({ \ 57 | __auto_type _old = (old); \ 58 | (void) __atomic_compare_exchange_n(ptr, &_old, new, false, \ 59 | __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); \ 60 | _old; }) 61 | 62 | #define catomic_fetch_add(ptr, n) __atomic_fetch_add(ptr, n, __ATOMIC_SEQ_CST) 63 | #define catomic_fetch_sub(ptr, n) __atomic_fetch_sub(ptr, n, __ATOMIC_SEQ_CST) 64 | #define catomic_fetch_and(ptr, n) __atomic_fetch_and(ptr, n, __ATOMIC_SEQ_CST) 65 | #define catomic_fetch_or(ptr, n) __atomic_fetch_or(ptr, n, __ATOMIC_SEQ_CST) 66 | #define catomic_fetch_xor(ptr, n) __atomic_fetch_xor(ptr, n, __ATOMIC_SEQ_CST) 67 | 68 | #define catomic_fetch_inc(ptr) catomic_fetch_add(ptr, 1) 69 | #define catomic_fetch_dec(ptr) catomic_fetch_sub(ptr, 1) 70 | 71 | #define catomic_add(ptr, n) ((void) catomic_fetch_add(ptr, n)) 72 | #define catomic_sub(ptr, n) ((void) catomic_fetch_sub(ptr, n)) 73 | #define catomic_and(ptr, n) ((void) catomic_fetch_and(ptr, n)) 74 | #define catomic_or(ptr, n) ((void) catomic_fetch_or(ptr, n)) 75 | #define catomic_xor(ptr, n) ((void) catomic_fetch_xor(ptr, n)) 76 | #define catomic_inc(ptr) ((void) catomic_fetch_inc(ptr)) 77 | #define catomic_dec(ptr) ((void) catomic_fetch_dec(ptr)) 78 | -------------------------------------------------------------------------------- /fs.c: -------------------------------------------------------------------------------- 1 | #include "vhost/fs.h" 2 | #include "virtio/virtio_fs.h" 3 | 4 | #include "bio.h" 5 | #include "logging.h" 6 | #include "server_internal.h" 7 | #include "vdev.h" 8 | 9 | /******************************************************************************/ 10 | 11 | struct vhd_fsdev { 12 | /* Base vdev */ 13 | struct vhd_vdev vdev; 14 | 15 | /* VM-facing interface type */ 16 | struct virtio_fs_dev vfs; 17 | 18 | LIST_ENTRY(vhd_fsdev) fsdevs; 19 | }; 20 | 21 | static LIST_HEAD(, vhd_fsdev) g_fsdev_list = LIST_HEAD_INITIALIZER(g_fsdev_list); 22 | 23 | #define VHD_FSDEV_FROM_VDEV(ptr) containerof(ptr, struct vhd_fsdev, vdev) 24 | 25 | /******************************************************************************/ 26 | 27 | static uint64_t vfs_get_features(struct vhd_vdev *vdev) 28 | { 29 | return VIRTIO_FS_DEFAULT_FEATURES; 30 | } 31 | 32 | static int vfs_set_features(struct vhd_vdev *vdev, uint64_t features) 33 | { 34 | return 0; 35 | } 36 | 37 | static size_t vfs_get_config(struct vhd_vdev *vdev, void *cfgbuf, 38 | size_t bufsize, size_t offset) 39 | { 40 | struct vhd_fsdev *dev = VHD_FSDEV_FROM_VDEV(vdev); 41 | 42 | if (offset >= sizeof(dev->vfs.config)) { 43 | return 0; 44 | } 45 | 46 | size_t data_size = MIN(bufsize, sizeof(dev->vfs.config) - offset); 47 | 48 | memcpy(cfgbuf, (char *)(&dev->vfs.config) + offset, data_size); 49 | 50 | return data_size; 51 | } 52 | 53 | static int vfs_dispatch_requests(struct vhd_vdev *vdev, 54 | struct vhd_vring *vring) 55 | { 56 | struct vhd_fsdev *dev = VHD_FSDEV_FROM_VDEV(vdev); 57 | return virtio_fs_dispatch_requests(&dev->vfs, &vring->vq); 58 | } 59 | 60 | static void vfs_free(struct vhd_vdev *vdev) 61 | { 62 | struct vhd_fsdev *dev = VHD_FSDEV_FROM_VDEV(vdev); 63 | 64 | LIST_REMOVE(dev, fsdevs); 65 | 
vhd_free(dev); 66 | } 67 | 68 | static const struct vhd_vdev_type g_virtio_fs_vdev_type = { 69 | .desc = "virtio-fs", 70 | .get_features = vfs_get_features, 71 | .set_features = vfs_set_features, 72 | .get_config = vfs_get_config, 73 | .dispatch_requests = vfs_dispatch_requests, 74 | .free = vfs_free, 75 | }; 76 | 77 | /******************************************************************************/ 78 | 79 | struct vhd_vdev *vhd_register_fs(struct vhd_fsdev_info *fsdev, 80 | struct vhd_request_queue *rq, 81 | void *priv) 82 | { 83 | return vhd_register_fs_mq(fsdev, &rq, 1, priv); 84 | } 85 | 86 | struct vhd_vdev *vhd_register_fs_mq(struct vhd_fsdev_info *fsdev, 87 | struct vhd_request_queue **rqs, 88 | int num_rqs, 89 | void *priv) 90 | { 91 | VHD_VERIFY(fsdev); 92 | VHD_VERIFY(rqs); 93 | 94 | struct vhd_fsdev *dev = vhd_zalloc(sizeof(*dev)); 95 | 96 | int res = virtio_fs_init_dev(&dev->vfs, fsdev); 97 | if (res != 0) { 98 | goto error_out; 99 | } 100 | 101 | res = vhd_vdev_init_server(&dev->vdev, fsdev->socket_path, &g_virtio_fs_vdev_type, 102 | fsdev->num_queues, rqs, num_rqs, priv, NULL, NULL, 0); 103 | if (res != 0) { 104 | goto error_out; 105 | } 106 | 107 | LIST_INSERT_HEAD(&g_fsdev_list, dev, fsdevs); 108 | return &dev->vdev; 109 | 110 | error_out: 111 | vhd_free(dev); 112 | return NULL; 113 | } 114 | 115 | void vhd_unregister_fs(struct vhd_vdev *vdev, 116 | void (*unregister_complete)(void *), 117 | void *arg) 118 | { 119 | vhd_vdev_stop_server(vdev, unregister_complete, arg); 120 | } 121 | -------------------------------------------------------------------------------- /include/vhost/server.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include "vhost/types.h" 6 | 7 | #ifdef __cplusplus 8 | extern "C" { 9 | #endif 10 | 11 | #define VHD_MAX_REQUEST_QUEUES 256 12 | 13 | struct vhd_vdev; 14 | struct vhd_io; 15 | 16 | /** 17 | * Logging support 18 | */ 19 | 20 | enum LogLevel { 21 | LOG_ERROR = 0, 22 | LOG_WARNING = 1, 23 | LOG_INFO = 2, 24 | LOG_DEBUG = 3 25 | }; 26 | 27 | typedef void (*log_function)(enum LogLevel level, const char *format, ...); 28 | 29 | /** 30 | * Start vhost server 31 | * 32 | * Server will spawn one native thread to wait for incoming vhost handshakes. 33 | * This thread will only handle global vhost protocol communication. 34 | * Device I/O events are handled separately by plugging into request queues. 35 | * 36 | * Return 0 on success or negative error code. 37 | */ 38 | int vhd_start_vhost_server(log_function log_fn); 39 | 40 | /** 41 | * Stop vhost server 42 | * 43 | * Stop vhost event thread which means no new vhost connections are possible 44 | */ 45 | void vhd_stop_vhost_server(void); 46 | 47 | /** 48 | * Request instance stored in request queue 49 | */ 50 | struct vhd_request { 51 | /* Device that generated this request */ 52 | struct vhd_vdev *vdev; 53 | 54 | /* Device type-specific request data */ 55 | struct vhd_io *io; 56 | }; 57 | 58 | /** 59 | * Server request queue 60 | * 61 | * Request queues are created by client and attached to vhost device(s). 62 | * Each device will then send its events to its attched queue. 63 | * This way request queues serve as a unit of load balancing. 64 | */ 65 | struct vhd_request_queue; 66 | 67 | /** 68 | * Create new request queue 69 | */ 70 | struct vhd_request_queue *vhd_create_request_queue(void); 71 | 72 | /** 73 | * Destroy request queue. 74 | * Don't call this until there are devices attached to this queue. 
75 | */ 76 | void vhd_release_request_queue(struct vhd_request_queue *rq); 77 | 78 | /** 79 | * Run queue in calling thread. 80 | * Will block until any of the devices enqueue requests. 81 | * Returns: 82 | * 0 - when the request queue shouldn't be running any more 83 | * -EAGAIN - when the request should be running further 84 | * <0 - on other errors 85 | */ 86 | int vhd_run_queue(struct vhd_request_queue *rq); 87 | 88 | /** 89 | * Unblock running request queue. 90 | * After calling this vhd_run_queue will eventually return and can the be 91 | * reeintered. 92 | */ 93 | void vhd_stop_queue(struct vhd_request_queue *rq); 94 | 95 | /** 96 | * Dequeue next request. 97 | */ 98 | bool vhd_dequeue_request(struct vhd_request_queue *rq, 99 | struct vhd_request *out_req); 100 | 101 | /** 102 | * Get request queue metrics. 103 | */ 104 | void vhd_get_rq_stat(struct vhd_request_queue *rq, 105 | struct vhd_rq_metrics *metrics); 106 | 107 | /** 108 | * Block io request result 109 | */ 110 | enum vhd_bdev_io_result { 111 | VHD_BDEV_SUCCESS = 0, 112 | VHD_BDEV_IOERR, 113 | VHD_BDEV_CANCELED, 114 | }; 115 | 116 | /* 117 | * Complete the processing of the request. The backend calls this to indicate 118 | * that it's done with the request and the library may signal completion to the 119 | * guest driver and dispose of the request. 120 | */ 121 | void vhd_complete_bio(struct vhd_io *io, enum vhd_bdev_io_result status); 122 | 123 | /** 124 | * Get private data associated with vdev. 125 | */ 126 | void *vhd_vdev_get_priv(struct vhd_vdev *vdev); 127 | 128 | /** 129 | * Get statistics for device's virtio queue. 130 | */ 131 | int vhd_vdev_get_queue_stat(struct vhd_vdev *vdev, uint32_t queue_num, 132 | struct vhd_vq_metrics *metrics); 133 | 134 | #ifdef __cplusplus 135 | } 136 | #endif 137 | -------------------------------------------------------------------------------- /event.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #ifdef __cplusplus 6 | extern "C" { 7 | #endif 8 | 9 | #define VHD_EVENT_LOOP_DEFAULT_MAX_EVENTS 32 10 | 11 | /** 12 | * Event loop instance 13 | * 14 | * Each event loop will run in a thread which calls vhd_run_event_loop. 15 | * Events detected in given event loop iteration will also be handled in this 16 | * thread. 17 | * 18 | * Event loop management operations (add/remove events) are thread-safe, 19 | * although changes to list of events may not be visible until next 20 | * vhd_run_event_loop. 21 | */ 22 | struct vhd_event_loop; 23 | 24 | /** 25 | * Create new event loop. 26 | * @max_events How many events we can handle in one iteration. 27 | * Events are reported in FIFO order to avoid starvation. 28 | */ 29 | struct vhd_event_loop *vhd_create_event_loop(size_t max_events); 30 | 31 | /** 32 | * Free event loop. 33 | */ 34 | void vhd_free_event_loop(struct vhd_event_loop *evloop); 35 | 36 | /** 37 | * Run a single iteration of the event loop 38 | * 39 | * @timeout 0 to return immediately, -1 to block indefinitely, milliseconds 40 | * value otherwise. 
41 | * 42 | * @return 0 if the event loop is terminated upon request 43 | * -EAGAIN if the event loop should keep going 44 | * another negative code on error 45 | */ 46 | int vhd_run_event_loop(struct vhd_event_loop *evloop, int timeout_ms); 47 | 48 | /** 49 | * Request event loop termination 50 | */ 51 | void vhd_terminate_event_loop(struct vhd_event_loop *evloop); 52 | 53 | /* I/O handling to be associated with a file descriptor */ 54 | struct vhd_io_handler; 55 | 56 | /* 57 | * Add io handler @read for @fd and attach it to @evloop. 58 | * For safe data access must be called in @evloop only. 59 | */ 60 | struct vhd_io_handler *vhd_add_io_handler(struct vhd_event_loop *evloop, 61 | int fd, int (*read)(void *), 62 | void *opaque); 63 | 64 | /* 65 | * Stop monitoring io handler @handler's file descriptor and calling its 66 | * handler functions. 67 | * For safe data access must be called in @handler's event loop only. 68 | */ 69 | 70 | int vhd_detach_io_handler(struct vhd_io_handler *handler); 71 | /* 72 | * Resume monitoring io handler @handler's file descriptor and calling its 73 | * handler functions. 74 | * For safe data access must be called in @handler's event loop only. 75 | */ 76 | int vhd_attach_io_handler(struct vhd_io_handler *handler); 77 | 78 | /* 79 | * Detach io handler @handler from its event loop and delete it. 80 | * For safe data access must be called in @handler's event loop only. 81 | */ 82 | int vhd_del_io_handler(struct vhd_io_handler *handler); 83 | 84 | /** 85 | * Clear eventfd after handling it 86 | */ 87 | void vhd_clear_eventfd(int fd); 88 | 89 | /** 90 | * Trigger eventfd 91 | */ 92 | void vhd_set_eventfd(int fd); 93 | 94 | struct vhd_bh; 95 | typedef void vhd_bh_cb(void *opaque); 96 | 97 | struct vhd_bh *vhd_bh_new(struct vhd_event_loop *ctx, 98 | vhd_bh_cb *cb, void *opaque); 99 | void vhd_bh_schedule_oneshot(struct vhd_event_loop *ctx, 100 | vhd_bh_cb *cb, void *opaque); 101 | void vhd_bh_schedule(struct vhd_bh *bh); 102 | void vhd_bh_cancel(struct vhd_bh *bh); 103 | void vhd_bh_delete(struct vhd_bh *bh); 104 | 105 | /* 106 | * Submit a work item onto @evloop and wait till it's finished. 107 | * Must not be called in the target event loop. 108 | * 109 | * Returns exactly the value which user sets by vhd_complete_work(), no other 110 | * errors possible. 
111 | */ 112 | struct vhd_work; 113 | int vhd_submit_work_and_wait(struct vhd_event_loop *evloop, 114 | void (*func)(struct vhd_work *, void *), 115 | void *opaque); 116 | /* 117 | * Signal work completion to the submitter 118 | */ 119 | void vhd_complete_work(struct vhd_work *work, int ret); 120 | 121 | #ifdef __cplusplus 122 | } 123 | #endif 124 | -------------------------------------------------------------------------------- /virtio/virt_queue.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "vhost/types.h" 6 | #include "vhost_spec.h" 7 | 8 | #include "virtio_spec.h" 9 | 10 | #ifdef __cplusplus 11 | extern "C" { 12 | #endif 13 | 14 | /** 15 | * Describes parsed buffer chain to be handled by virtio device type 16 | */ 17 | struct virtio_iov { 18 | uint16_t niov_out; 19 | uint16_t niov_in; 20 | struct vhd_buffer *iov_out; 21 | struct vhd_buffer *iov_in; 22 | struct vhd_buffer buffers[/* niov_out + niov_in */]; 23 | }; 24 | 25 | struct vhd_memory_map; 26 | struct vhd_memory_log; 27 | 28 | struct virtio_virtq { 29 | const char *log_tag; 30 | 31 | uint32_t flags; 32 | struct virtq_desc *desc; 33 | struct virtq_avail *avail; 34 | struct virtq_used *used; 35 | uint64_t used_gpa_base; 36 | 37 | /* Size of queue in number of descriptors it can hold */ 38 | uint16_t qsz; 39 | 40 | /* Max chain length (for bug compatibility with non-compliant drivers) */ 41 | uint16_t max_chain_len; 42 | 43 | /* Shadow avail ring index */ 44 | uint16_t last_avail; 45 | 46 | /* 47 | * 2.4.5.3.1: A driver MUST NOT create a descriptor chain longer than 48 | * the Queue Size of the device 49 | * Thus we can preallocate a scratch area of a known size to accumulate 50 | * scatter-gather segments before handing them over to the device. 51 | */ 52 | uint16_t niov_out; 53 | uint16_t niov_in; 54 | struct vhd_buffer *buffers; 55 | 56 | /* 57 | * Virtqueue is broken, probably because there is an invalid descriptor 58 | * chain in it. 59 | * Broken status is sticky and so far cannot be repared. 60 | */ 61 | bool broken; 62 | 63 | 64 | /* 65 | * If set, VIRTIO_F_RING_EVENT_IDX is negotiated for this queue and 66 | * avail/used_event fields must be used for notification. 67 | */ 68 | bool has_event_idx; 69 | 70 | /* 71 | * eventfd for used buffers notification. 72 | * can be reset after virtq is started. 73 | */ 74 | int notify_fd; 75 | 76 | /* 77 | * Whether the processing of this virtq is enabled. 78 | * Can be toggled after virtq is started. 79 | */ 80 | bool enabled; 81 | 82 | /* inflight information */ 83 | uint64_t req_cnt; 84 | struct inflight_split_region *inflight_region; 85 | bool inflight_check; 86 | 87 | /* 88 | * these objects are per-device but storing a link on virtqueue facilitates 89 | * bookkeeping 90 | */ 91 | struct vhd_memory_map *mm; 92 | struct vhd_memory_log *log; 93 | 94 | /* Usage statistics */ 95 | struct vq_stat { 96 | /* Metrics provided to users */ 97 | struct vhd_vq_metrics metrics; 98 | 99 | /* Metrics service info fields. 
Not provided to users */ 100 | /* timestamps for periodic metrics */ 101 | time_t period_start_ts; 102 | } stat; 103 | }; 104 | 105 | void virtio_virtq_init(struct virtio_virtq *vq); 106 | 107 | void virtio_virtq_release(struct virtio_virtq *vq); 108 | 109 | bool virtq_is_broken(struct virtio_virtq *vq); 110 | 111 | void mark_broken(struct virtio_virtq *vq); 112 | 113 | typedef void(*virtq_handle_buffers_cb)(void *arg, 114 | struct virtio_virtq *vq, 115 | struct virtio_iov *iov); 116 | int virtq_dequeue_many(struct virtio_virtq *vq, 117 | virtq_handle_buffers_cb handle_buffers_cb, 118 | void *arg); 119 | 120 | void virtq_push(struct virtio_virtq *vq, struct virtio_iov *iov, uint32_t len); 121 | 122 | void virtq_set_notify_fd(struct virtio_virtq *vq, int fd); 123 | 124 | void virtio_free_iov(struct virtio_iov *iov); 125 | uint16_t virtio_iov_get_head(struct virtio_iov *iov); 126 | 127 | void virtio_virtq_get_stat(struct virtio_virtq *vq, 128 | struct vhd_vq_metrics *metrics); 129 | 130 | void abort_request(struct virtio_virtq *vq, struct virtio_iov *iov); 131 | #ifdef __cplusplus 132 | } 133 | #endif 134 | -------------------------------------------------------------------------------- /virtio/virtio_fs.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "vhost/fs.h" 4 | 5 | #include "virtio_fs.h" 6 | #include "virtio_fs_spec.h" 7 | 8 | #include "bio.h" 9 | #include "virt_queue.h" 10 | #include "logging.h" 11 | #include "server_internal.h" 12 | #include "vdev.h" 13 | 14 | /******************************************************************************/ 15 | 16 | struct virtio_fs_io { 17 | struct virtio_virtq *vq; 18 | struct virtio_iov *iov; 19 | 20 | struct vhd_io io; 21 | struct vhd_fs_io fs_io; 22 | }; 23 | 24 | /******************************************************************************/ 25 | 26 | static void complete_request(struct vhd_io *io) 27 | { 28 | struct virtio_fs_io *vbio = containerof(io, struct virtio_fs_io, io); 29 | struct virtio_iov *viov = vbio->iov; 30 | /* if IN iov has at least one buffer it accommodates fuse_out_header */ 31 | struct virtio_fs_out_header *out = 32 | viov->niov_in ? viov->iov_in[0].base : NULL; 33 | uint32_t len = out ?
out->len : 0; 34 | 35 | if (likely(io->status != VHD_BDEV_CANCELED)) { 36 | virtq_push(vbio->vq, vbio->iov, len); 37 | } 38 | 39 | virtio_free_iov(viov); 40 | vhd_free(vbio); 41 | } 42 | 43 | static int virtio_fs_handle_request(struct virtio_virtq *vq, 44 | struct vhd_io *io) 45 | { 46 | io->vring = VHD_VRING_FROM_VQ(vq); 47 | return vhd_enqueue_request(vhd_get_rq_for_vring(io->vring), io); 48 | } 49 | 50 | static void handle_buffers(void *arg, struct virtio_virtq *vq, struct virtio_iov *iov) 51 | { 52 | uint16_t niov = iov->niov_in + iov->niov_out; 53 | (void)arg; 54 | 55 | /* 56 | * Assume legacy message framing without VIRTIO_F_ANY_LAYOUT: 57 | * - virtio IN / FUSE OUT segments, with the first one fully containing 58 | * fuse_in_header 59 | * - virtio OUT / FUSE IN segments, with the first one fully containing 60 | * fuse_out_header (except FUSE_FORGET and FUSE_BATCH_FORGET which have 61 | * no response part at all) 62 | */ 63 | 64 | struct virtio_fs_in_header *in; 65 | struct virtio_fs_out_header *out; 66 | 67 | if (iov->niov_in && iov->iov_in[0].len < sizeof(*out)) { 68 | VHD_LOG_ERROR("No room for response in the request"); 69 | abort_request(vq, iov); 70 | return; 71 | } 72 | 73 | if (!iov->niov_out || iov->iov_out[0].len < sizeof(*in)) { 74 | VHD_LOG_ERROR("Malformed request header"); 75 | abort_request(vq, iov); 76 | return; 77 | } 78 | 79 | in = iov->iov_out[0].base; 80 | out = iov->niov_in ? iov->iov_in[0].base : NULL; 81 | 82 | struct virtio_fs_io *bio = vhd_zalloc(sizeof(*bio)); 83 | bio->vq = vq; 84 | bio->iov = iov; 85 | bio->io.completion_handler = complete_request; 86 | 87 | bio->fs_io.sglist.nbuffers = niov; 88 | bio->fs_io.sglist.buffers = iov->buffers; 89 | 90 | int res = virtio_fs_handle_request(bio->vq, &bio->io); 91 | if (res != 0) { 92 | VHD_LOG_ERROR("request submission failed with %d", res); 93 | 94 | if (out) { 95 | out->len = sizeof(*out); 96 | out->error = res; 97 | out->unique = in->unique; 98 | } 99 | 100 | complete_request(&bio->io); 101 | return; 102 | } 103 | } 104 | 105 | /******************************************************************************/ 106 | 107 | int virtio_fs_init_dev( 108 | struct virtio_fs_dev *dev, 109 | struct vhd_fsdev_info *fsdev) 110 | { 111 | VHD_VERIFY(dev); 112 | VHD_VERIFY(fsdev); 113 | 114 | dev->fsdev = fsdev; 115 | 116 | dev->config = (struct virtio_fs_config) { 117 | .num_request_queues = fsdev->num_queues, 118 | }; 119 | if (fsdev->tag) { 120 | memcpy(dev->config.tag, fsdev->tag, 121 | MIN(strlen(fsdev->tag), sizeof(dev->config.tag))); 122 | } 123 | 124 | return 0; 125 | } 126 | 127 | int virtio_fs_dispatch_requests(struct virtio_fs_dev *dev, 128 | struct virtio_virtq *vq) 129 | { 130 | VHD_VERIFY(dev); 131 | VHD_VERIFY(vq); 132 | 133 | return virtq_dequeue_many(vq, handle_buffers, dev); 134 | } 135 | 136 | struct vhd_fs_io *vhd_get_fs_io(struct vhd_io *io) 137 | { 138 | struct virtio_fs_io *bio = containerof(io, struct virtio_fs_io, io); 139 | return &bio->fs_io; 140 | } 141 | -------------------------------------------------------------------------------- /virtio/virtio_blk_spec.h: -------------------------------------------------------------------------------- 1 | /* 2 | * virtio blk protocol definitions according to virtio 1.0 spec 3 | */ 4 | 5 | #pragma once 6 | 7 | #include "platform.h" 8 | #include "virtio_types.h" 9 | 10 | #ifdef __cplusplus 11 | extern "C" { 12 | #endif 13 | 14 | #define VIRTIO_BLK_SECTOR_SIZE 512 15 | #define VIRTIO_BLK_SECTOR_SHIFT 9 16 | #define VIRTIO_BLK_DISKID_LENGTH 20 17 | #define 
VIRTIO_BLK_STATUS_LENGTH 1 18 | 19 | /* Feature bits */ 20 | #define VIRTIO_BLK_F_SIZE_MAX 1 /* Maximum size of any single segment is in size_max. */ 21 | #define VIRTIO_BLK_F_SEG_MAX 2 /* Maximum number of segments in a request is in seg_max. */ 22 | #define VIRTIO_BLK_F_GEOMETRY 4 /* Disk-style geometry specified in geometry. */ 23 | #define VIRTIO_BLK_F_RO 5 /* Device is read-only. */ 24 | #define VIRTIO_BLK_F_BLK_SIZE 6 /* Block size of disk is in blk_size. */ 25 | #define VIRTIO_BLK_F_FLUSH 9 /* Cache flush command support. */ 26 | #define VIRTIO_BLK_F_TOPOLOGY 10 /* Device exports information on optimal I/O alignment. */ 27 | #define VIRTIO_BLK_F_CONFIG_WCE 11 /* Device can toggle its cache between writeback and writethrough modes. */ 28 | #define VIRTIO_BLK_F_DISCARD 13 /* Device can support discard command */ 29 | #define VIRTIO_BLK_F_WRITE_ZEROES 14 /* Device supports write-zeroes requests */ 30 | 31 | /* Custom extentions */ 32 | #define VIRTIO_BLK_F_MQ 12 /* Device reports maximum supported queues in numqueues config field */ 33 | 34 | /* Legacy interface: feature bits */ 35 | #define VIRTIO_BLK_F_BARRIER 0 /* Device supports request barriers. */ 36 | #define VIRTIO_BLK_F_SCSI 7 /* Device supports scsi packet commands. */ 37 | 38 | /* 39 | * Device configuration layout. 40 | * The capacity of the device (expressed in 512-byte sectors) is always present. 41 | * The availability of the others all depend on various feature bits as 42 | * indicated above. 43 | */ 44 | struct VHD_PACKED virtio_blk_config { 45 | le64 capacity; 46 | le32 size_max; 47 | le32 seg_max; 48 | struct VHD_PACKED virtio_blk_geometry { 49 | le16 cylinders; 50 | u8 heads; 51 | u8 sectors; 52 | } geometry; 53 | le32 blk_size; 54 | struct VHD_PACKED virtio_blk_topology { 55 | /* # of logical blocks per physical block (log2) */ 56 | u8 physical_block_exp; 57 | /* offset of first aligned logical block */ 58 | u8 alignment_offset; 59 | /* suggested minimum I/O size in blocks */ 60 | le16 min_io_size; 61 | /* optimal (suggested maximum) I/O size in blocks */ 62 | le32 opt_io_size; 63 | } topology; 64 | u8 writeback; 65 | u8 _reserved; 66 | le16 numqueues; 67 | 68 | /* VIRTIO_BLK_F_DISCARD-specific fields */ 69 | le32 max_discard_sectors; 70 | le32 max_discard_seg; 71 | le32 discard_sector_alignment; 72 | 73 | /* VIRTIO_BLK_F_WRITE_ZEROES-specific fields */ 74 | le32 max_write_zeroes_sectors; 75 | le32 max_write_zeroes_seg; 76 | u8 write_zeroes_may_unmap; 77 | u8 _reserved1[3]; 78 | }; 79 | 80 | /* 81 | * Device Operation 82 | * The driver queues requests to the virtqueue, and they are used by the device 83 | * (not necessarily in order). 
84 | * 85 | * Request is a variable sized structure: 86 | * struct virtio_blk_req { 87 | * le32 type; 88 | * le32 reserved; 89 | * le64 sector; 90 | * u8 data[][512]; 91 | * u8 status; 92 | * }; 93 | */ 94 | struct virtio_blk_req_hdr { 95 | #define VIRTIO_BLK_T_IN 0 /* Device read */ 96 | #define VIRTIO_BLK_T_OUT 1 /* Device write */ 97 | #define VIRTIO_BLK_T_FLUSH 4 /* Flush */ 98 | #define VIRTIO_BLK_T_GET_ID 8 /* Get device id */ 99 | #define VIRTIO_BLK_T_DISCARD 11 /* Discard */ 100 | #define VIRTIO_BLK_T_WRITE_ZEROES 13 /* Write zeroes */ 101 | le32 type; 102 | le32 reserved; 103 | le64 sector; 104 | }; 105 | 106 | struct virtio_blk_discard_write_zeroes { 107 | le64 sector; 108 | le32 num_sectors; 109 | struct { 110 | le32 unmap:1; 111 | le32 reserved:31; 112 | } flags; 113 | }; 114 | 115 | VHD_STATIC_ASSERT(sizeof(struct virtio_blk_req_hdr) == 16); 116 | VHD_STATIC_ASSERT(sizeof(struct virtio_blk_discard_write_zeroes) == 16); 117 | 118 | #define VIRTIO_BLK_S_OK 0 119 | #define VIRTIO_BLK_S_IOERR 1 120 | #define VIRTIO_BLK_S_UNSUPP 2 121 | 122 | #ifdef __cplusplus 123 | } 124 | #endif 125 | -------------------------------------------------------------------------------- /include/vhost/blockdev.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include "vhost/types.h" 6 | 7 | #ifdef __cplusplus 8 | extern "C" { 9 | #endif 10 | 11 | struct vhd_io; 12 | struct vhd_request_queue; 13 | struct vhd_vdev; 14 | 15 | /* 16 | * vhd_bdev_io values are always expressed in these units for any 17 | * vhd_bdev_info->sector_size configuration. 18 | */ 19 | #define VHD_SECTOR_SHIFT (9) 20 | #define VHD_SECTOR_SIZE (1ull << VHD_SECTOR_SHIFT) 21 | 22 | #define VHD_BDEV_F_READONLY (1ull << 0) 23 | #define VHD_BDEV_F_DISCARD (1ull << 1) 24 | #define VHD_BDEV_F_WRITE_ZEROES (1ull << 2) 25 | 26 | /** 27 | * Client-supplied block device backend definition 28 | */ 29 | struct vhd_bdev_info { 30 | /* Blockdev serial */ 31 | const char *serial; 32 | 33 | /* Path to create listen sockets */ 34 | const char *socket_path; 35 | 36 | /* 37 | * Physical block size in bytes, must be a multiple of sector_size 38 | * or of VHD_SECTOR_SIZE if sector_size is 0. 39 | */ 40 | uint32_t block_size; 41 | 42 | /* 43 | * Logical sector size in bytes, VHD_SECTOR_SIZE is used if 44 | * this value is set to 0. 45 | * 46 | * Note that the virtio specification technically provides this value as 47 | * a suggestion to the guest. Thus, a 4096-byte sector size disk may still 48 | * generate 512-byte requests. Technically all existing software treats 49 | * this value as a logical sector size, but care must still be taken. 50 | */ 51 | uint32_t sector_size; 52 | 53 | /* Optimal io size in bytes */ 54 | uint32_t optimal_io_size; 55 | 56 | /* Total number of backend queues this device supports */ 57 | uint32_t num_queues; 58 | 59 | /* Device size in blocks */ 60 | uint64_t total_blocks; 61 | 62 | /* Supported VHD_BDEV_F_* features */ 63 | uint64_t features; 64 | 65 | /* Gets called after mapping guest memory region */ 66 | int (*map_cb)(void *addr, size_t len); 67 | 68 | /* Gets called before unmapping guest memory region */ 69 | int (*unmap_cb)(void *addr, size_t len); 70 | 71 | /* 72 | * If set to a non-zero value, PTEs backing the guest memory regions 73 | * for this blockdev are flushed (unmapped and mapped back) every 74 | * N bytes processed by the backend. E.g. if this value is 1024, PTEs 75 | * will be flushed after the guest reads/writes 2 blocks. 
76 | */ 77 | size_t pte_flush_byte_threshold; 78 | }; 79 | 80 | static inline bool vhd_blockdev_is_readonly(const struct vhd_bdev_info *bdev) 81 | { 82 | return bdev->features & VHD_BDEV_F_READONLY; 83 | } 84 | 85 | static inline bool vhd_blockdev_has_discard(const struct vhd_bdev_info *bdev) 86 | { 87 | return bdev->features & VHD_BDEV_F_DISCARD; 88 | } 89 | 90 | static inline bool vhd_blockdev_has_write_zeroes( 91 | const struct vhd_bdev_info *bdev) 92 | { 93 | return bdev->features & VHD_BDEV_F_WRITE_ZEROES; 94 | } 95 | 96 | static inline uint32_t vhd_blockdev_sector_size( 97 | const struct vhd_bdev_info *bdev) 98 | { 99 | return bdev->sector_size ? bdev->sector_size : VHD_SECTOR_SIZE; 100 | } 101 | 102 | /** 103 | * Block io request type 104 | */ 105 | enum vhd_bdev_io_type { 106 | VHD_BDEV_READ, 107 | VHD_BDEV_WRITE, 108 | VHD_BDEV_DISCARD, 109 | VHD_BDEV_WRITE_ZEROES, 110 | }; 111 | 112 | /** 113 | * In-flight blockdev io request 114 | */ 115 | struct vhd_bdev_io { 116 | enum vhd_bdev_io_type type; 117 | 118 | /* 119 | * These values are ALWAYS expressed in VHD_SECTOR_SIZE (aka 512-byte) 120 | * units, even if this device has a larger sector_size. 121 | */ 122 | uint64_t first_sector; 123 | uint64_t total_sectors; 124 | 125 | struct vhd_sglist sglist; 126 | }; 127 | 128 | struct vhd_bdev_io *vhd_get_bdev_io(struct vhd_io *io); 129 | 130 | /** 131 | * Register a vhost block device. 132 | * 133 | * After registering a device, it will be accessible to clients through a vhost 134 | * socket. 135 | * All requests are submitted to attacher request queues for caller to process. 136 | * 137 | * @bdev Caller block device info. The structure is used only for 138 | * initialization and may be freed by caller after 139 | * vhd_register_blockdev() returns. 140 | * @rqs An array of request queues to use for dispatching device I/O 141 | * requests. 142 | * @num_rqs Number of request queues in the @rqs array. 143 | * @priv Caller private data to associate with resulting vdev. 144 | */ 145 | struct vhd_vdev *vhd_register_blockdev(const struct vhd_bdev_info *bdev, 146 | struct vhd_request_queue **rqs, 147 | int num_rqs, void *priv); 148 | 149 | /** 150 | * Unregister a vhost block device. 151 | */ 152 | void vhd_unregister_blockdev(struct vhd_vdev *vdev, 153 | void (*unregister_complete)(void *), void *arg); 154 | 155 | /** 156 | * Resize a vhost block device. 157 | * 158 | * The function change virtio config, that client may read by 159 | * VHOST_USER_GET_CONFIG command. 160 | * 161 | * Note, that client is not notified about config change, the caller is 162 | * responsible for this. 
163 | */ 164 | void vhd_blockdev_set_total_blocks(struct vhd_vdev *vdev, 165 | uint64_t total_blocks); 166 | 167 | #ifdef __cplusplus 168 | } 169 | #endif 170 | -------------------------------------------------------------------------------- /blockdev.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "vhost/blockdev.h" 5 | #include "server_internal.h" 6 | #include "vdev.h" 7 | #include "logging.h" 8 | 9 | #include "bio.h" 10 | #include "virtio/virtio_blk.h" 11 | 12 | struct vhd_bdev { 13 | /* Base vdev */ 14 | struct vhd_vdev vdev; 15 | 16 | /* VM-facing interface type */ 17 | struct virtio_blk_dev vblk; 18 | 19 | LIST_ENTRY(vhd_bdev) blockdevs; 20 | }; 21 | 22 | static LIST_HEAD(, vhd_bdev) g_bdev_list = LIST_HEAD_INITIALIZER(g_bdev_list); 23 | 24 | #define VHD_BLOCKDEV_FROM_VDEV(ptr) containerof(ptr, struct vhd_bdev, vdev) 25 | 26 | /*////////////////////////////////////////////////////////////////////////////*/ 27 | 28 | static uint64_t vblk_get_features(struct vhd_vdev *vdev) 29 | { 30 | struct vhd_bdev *dev = VHD_BLOCKDEV_FROM_VDEV(vdev); 31 | return virtio_blk_get_features(&dev->vblk); 32 | } 33 | 34 | static int vblk_set_features(struct vhd_vdev *vdev, uint64_t features) 35 | { 36 | return 0; 37 | } 38 | 39 | /* vhost_get_config assumes that config is less than VHOST_USER_CONFIG_SPACE_MAX */ 40 | VHD_STATIC_ASSERT(sizeof(struct virtio_blk_config) <= VHOST_USER_CONFIG_SPACE_MAX); 41 | 42 | static size_t vblk_get_config(struct vhd_vdev *vdev, void *cfgbuf, 43 | size_t bufsize, size_t offset) 44 | { 45 | struct vhd_bdev *dev = VHD_BLOCKDEV_FROM_VDEV(vdev); 46 | 47 | return virtio_blk_get_config(&dev->vblk, cfgbuf, bufsize, offset); 48 | } 49 | 50 | static int vblk_dispatch(struct vhd_vdev *vdev, struct vhd_vring *vring) 51 | { 52 | struct vhd_bdev *dev = VHD_BLOCKDEV_FROM_VDEV(vdev); 53 | return virtio_blk_dispatch_requests(&dev->vblk, &vring->vq); 54 | } 55 | 56 | static void vblk_free(struct vhd_vdev *vdev) 57 | { 58 | struct vhd_bdev *bdev = VHD_BLOCKDEV_FROM_VDEV(vdev); 59 | 60 | LIST_REMOVE(bdev, blockdevs); 61 | virtio_blk_destroy_dev(&bdev->vblk); 62 | vhd_free(bdev); 63 | } 64 | 65 | static const struct vhd_vdev_type g_virtio_blk_vdev_type = { 66 | .desc = "virtio-blk", 67 | .get_features = vblk_get_features, 68 | .set_features = vblk_set_features, 69 | .get_config = vblk_get_config, 70 | .dispatch_requests = vblk_dispatch, 71 | .free = vblk_free, 72 | }; 73 | 74 | struct set_total_blocks { 75 | struct vhd_vdev *vdev; 76 | uint64_t total_blocks; 77 | }; 78 | 79 | static void set_total_blocks_entry(struct vhd_work *work, void *opaque) 80 | { 81 | struct set_total_blocks *stb = opaque; 82 | struct vhd_bdev *dev = VHD_BLOCKDEV_FROM_VDEV(stb->vdev); 83 | 84 | virtio_blk_set_total_blocks(&dev->vblk, stb->total_blocks); 85 | vhd_complete_work(work, 0); 86 | } 87 | 88 | void vhd_blockdev_set_total_blocks(struct vhd_vdev *vdev, uint64_t total_blocks) 89 | { 90 | struct set_total_blocks stb = { 91 | .vdev = vdev, 92 | .total_blocks = total_blocks, 93 | }; 94 | 95 | VHD_OBJ_INFO(vdev, "Set total blocks %" PRIu64, total_blocks); 96 | 97 | /* 98 | * Modify virtio config in g_vhost_evloop, to not interfere with .get_config 99 | * 100 | * We don't need vdev_submit_work_and_wait() logic here, as setting 101 | * total_blocks in config is unrelated stopping process, so it should not be 102 | * a problem intersect with wdev_stop_work work. 
103 | */ 104 | int ret = vhd_submit_ctl_work_and_wait(set_total_blocks_entry, &stb); 105 | VHD_VERIFY(ret == 0); 106 | } 107 | 108 | static bool blockdev_validate_features(const struct vhd_bdev_info *bdev) 109 | { 110 | const uint64_t valid_features = VHD_BDEV_F_READONLY | 111 | VHD_BDEV_F_DISCARD | 112 | VHD_BDEV_F_WRITE_ZEROES; 113 | return (bdev->features & valid_features) == bdev->features; 114 | } 115 | 116 | struct vhd_vdev *vhd_register_blockdev(const struct vhd_bdev_info *bdev, 117 | struct vhd_request_queue **rqs, 118 | int num_rqs, void *priv) 119 | { 120 | int res; 121 | uint32_t sector_size; 122 | 123 | if (!bdev->total_blocks || !bdev->block_size) { 124 | VHD_LOG_ERROR("Zero blockdev capacity %" PRIu64 " * %" PRIu32, 125 | bdev->total_blocks, bdev->block_size); 126 | return NULL; 127 | } 128 | 129 | sector_size = vhd_blockdev_sector_size(bdev); 130 | 131 | if (sector_size & (sector_size - 1) || sector_size % VHD_SECTOR_SIZE) { 132 | VHD_LOG_ERROR("Invalid sector size %" PRIu32 " must be a power " 133 | "of two and multiple of %llu", sector_size, 134 | VHD_SECTOR_SIZE); 135 | return NULL; 136 | } 137 | 138 | if ((bdev->block_size & (bdev->block_size - 1)) || 139 | bdev->block_size % sector_size) { 140 | VHD_LOG_ERROR("Block size %" PRIu32 " is not" 141 | " a power of two multiple of sector size (%" PRIu32 ")", 142 | bdev->block_size, sector_size); 143 | return NULL; 144 | } 145 | 146 | if (bdev->optimal_io_size % bdev->block_size) { 147 | VHD_LOG_ERROR("Optimal io size %" PRIu32 " is not" 148 | " a multiple of block size (%" PRIu32 ")", 149 | bdev->optimal_io_size, bdev->block_size); 150 | return NULL; 151 | } 152 | 153 | if (bdev->total_blocks > (UINT64_MAX / bdev->block_size)) { 154 | VHD_LOG_ERROR("Disk capacity %" PRIu64 " is too large!", 155 | bdev->total_blocks); 156 | return NULL; 157 | } 158 | 159 | if (!blockdev_validate_features(bdev)) { 160 | VHD_LOG_ERROR("Invalid blockdev features %" PRIu64, bdev->features); 161 | return NULL; 162 | } 163 | 164 | struct vhd_bdev *dev = vhd_zalloc(sizeof(*dev)); 165 | 166 | virtio_blk_init_dev(&dev->vblk, bdev); 167 | 168 | res = vhd_vdev_init_server(&dev->vdev, bdev->socket_path, 169 | &g_virtio_blk_vdev_type, 170 | bdev->num_queues, rqs, num_rqs, priv, 171 | bdev->map_cb, bdev->unmap_cb, 172 | bdev->pte_flush_byte_threshold); 173 | if (res != 0) { 174 | goto error_out; 175 | } 176 | 177 | LIST_INSERT_HEAD(&g_bdev_list, dev, blockdevs); 178 | return &dev->vdev; 179 | 180 | error_out: 181 | virtio_blk_destroy_dev(&dev->vblk); 182 | vhd_free(dev); 183 | return NULL; 184 | } 185 | 186 | void vhd_unregister_blockdev(struct vhd_vdev *vdev, 187 | void (*unregister_complete)(void *), void *arg) 188 | { 189 | vhd_vdev_stop_server(vdev, unregister_complete, arg); 190 | } 191 | -------------------------------------------------------------------------------- /vhost_spec.h: -------------------------------------------------------------------------------- 1 | /** 2 | * vhost-user protocol definitions 3 | */ 4 | 5 | #pragma once 6 | 7 | #include 8 | 9 | #ifdef __cplusplus 10 | extern "C" { 11 | #endif 12 | 13 | /* 14 | * Define protocol structures and definitions based on the vhost user 15 | * protocol specification: 16 | * https://github.com/qemu/qemu/blob/master/docs/interop/vhost-user.txt 17 | */ 18 | 19 | /* Vhost user protocol flags. */ 20 | /* This is a vhost protocol version. 
*/ 21 | #define VHOST_USER_VERSION_MASK 0x3 22 | #define VHOST_USER_MSG_VERSION 0x1 23 | #define VHOST_USER_MSG_FLAGS_REPLY ((1 << 2) | VHOST_USER_MSG_VERSION) 24 | #define VHOST_USER_MSG_FLAGS_REPLY_ACK (1 << 3) 25 | 26 | /* 27 | * Vhost user protocol features (GET_PROTOCOL_FEATURES and 28 | * SET_PROTOCOL_FEATURES commands). 29 | */ 30 | #define VHOST_USER_PROTOCOL_F_MQ 0 31 | #define VHOST_USER_PROTOCOL_F_LOG_SHMFD 1 32 | #define VHOST_USER_PROTOCOL_F_RARP 2 33 | #define VHOST_USER_PROTOCOL_F_REPLY_ACK 3 34 | #define VHOST_USER_PROTOCOL_F_MTU 4 35 | #define VHOST_USER_PROTOCOL_F_SLAVE_REQ 5 36 | #define VHOST_USER_PROTOCOL_F_CROSS_ENDIAN 6 37 | #define VHOST_USER_PROTOCOL_F_CRYPTO_SESSION 7 38 | #define VHOST_USER_PROTOCOL_F_PAGEFAULT 8 39 | #define VHOST_USER_PROTOCOL_F_CONFIG 9 40 | #define VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD 12 41 | #define VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS 15 42 | 43 | /* Vhost user features (GET_FEATURES and SET_FEATURES commands). */ 44 | #define VHOST_F_LOG_ALL 26 45 | #define VHOST_USER_F_PROTOCOL_FEATURES 30 46 | #define VIRTIO_F_RING_INDIRECT_DESC 28 47 | #define VIRTIO_F_RING_EVENT_IDX 29 48 | #define VIRTIO_F_VERSION_1 32 49 | 50 | /* 51 | * Invalid FD bit for the VHOST_USER_SET_VRING_KICK and 52 | * VHOST_USER_SET_VRING_CALL commands. If this bit is 53 | * set then the communication is forced to use polling 54 | * instead of using for a kick. 55 | */ 56 | #define VHOST_VRING_INVALID_FD (1 << 8) 57 | #define VHOST_VRING_IDX_MASK 0xff 58 | 59 | /* Maximum size of the device PCI configuration space. */ 60 | #define VHOST_USER_CONFIG_SPACE_MAX 256 61 | 62 | /* 63 | * According to the protocol specification this is the maximum number 64 | * of memory regions sent in one command. Also it is the maximum number 65 | * of file descriptors. 66 | */ 67 | #define VHOST_USER_MEM_REGIONS_MAX 8 68 | #define VHOST_USER_MAX_FDS VHOST_USER_MEM_REGIONS_MAX 69 | 70 | /* Define request types. 
*/ 71 | enum { 72 | VHOST_USER_NONE = 0, 73 | VHOST_USER_GET_FEATURES = 1, 74 | VHOST_USER_SET_FEATURES = 2, 75 | VHOST_USER_SET_OWNER = 3, 76 | VHOST_USER_RESET_OWNER = 4, 77 | VHOST_USER_SET_MEM_TABLE = 5, 78 | VHOST_USER_SET_LOG_BASE = 6, 79 | VHOST_USER_SET_LOG_FD = 7, 80 | VHOST_USER_SET_VRING_NUM = 8, 81 | VHOST_USER_SET_VRING_ADDR = 9, 82 | VHOST_USER_SET_VRING_BASE = 10, 83 | VHOST_USER_GET_VRING_BASE = 11, 84 | VHOST_USER_SET_VRING_KICK = 12, 85 | VHOST_USER_SET_VRING_CALL = 13, 86 | VHOST_USER_SET_VRING_ERR = 14, 87 | VHOST_USER_GET_PROTOCOL_FEATURES = 15, 88 | VHOST_USER_SET_PROTOCOL_FEATURES = 16, 89 | VHOST_USER_GET_QUEUE_NUM = 17, 90 | VHOST_USER_SET_VRING_ENABLE = 18, 91 | VHOST_USER_SEND_RARP = 19, 92 | VHOST_USER_NET_SET_MTU = 20, 93 | VHOST_USER_SET_SLAVE_REQ_FD = 21, 94 | VHOST_USER_IOTLB_MSG = 22, 95 | VHOST_USER_SET_VRING_ENDIAN = 23, 96 | VHOST_USER_GET_CONFIG = 24, 97 | VHOST_USER_SET_CONFIG = 25, 98 | VHOST_USER_CREATE_CRYPTO_SESSION = 26, 99 | VHOST_USER_CLOSE_CRYPTO_SESSION = 27, 100 | VHOST_USER_POSTCOPY_ADVISE = 28, 101 | VHOST_USER_POSTCOPY_LISTEN = 29, 102 | VHOST_USER_POSTCOPY_END = 30, 103 | VHOST_USER_GET_INFLIGHT_FD = 31, 104 | VHOST_USER_SET_INFLIGHT_FD = 32, 105 | VHOST_USER_GET_MAX_MEM_SLOTS = 36, 106 | VHOST_USER_ADD_MEM_REG = 37, 107 | VHOST_USER_REM_MEM_REG = 38, 108 | }; 109 | 110 | struct vhost_user_mem_region { 111 | uint64_t guest_addr; 112 | uint64_t size; 113 | uint64_t user_addr; 114 | uint64_t mmap_offset; 115 | }; 116 | 117 | struct vhost_user_mem_single_mem_desc { 118 | uint64_t _padding; 119 | struct vhost_user_mem_region region; 120 | }; 121 | 122 | struct vhost_user_mem_desc { 123 | uint32_t nregions; 124 | uint32_t _padding; 125 | struct vhost_user_mem_region regions[VHOST_USER_MEM_REGIONS_MAX]; 126 | }; 127 | 128 | struct vhost_user_vring_state { 129 | uint32_t index; 130 | uint32_t num; 131 | }; 132 | 133 | struct vhost_user_vring_addr { 134 | uint32_t index; 135 | #define VHOST_VRING_F_LOG (1 << 0) 136 | uint32_t flags; 137 | uint64_t desc_addr; 138 | uint64_t used_addr; 139 | uint64_t avail_addr; 140 | uint64_t used_gpa_base; 141 | }; 142 | 143 | struct vhost_user_config_space { 144 | uint32_t offset; 145 | uint32_t size; 146 | uint32_t flags; 147 | uint8_t payload[VHOST_USER_CONFIG_SPACE_MAX]; 148 | }; 149 | #define VHOST_CONFIG_HDR_SIZE (offsetof(struct vhost_user_config_space, payload)) 150 | 151 | struct vhost_user_inflight_desc { 152 | uint64_t mmap_size; 153 | uint64_t mmap_offset; 154 | uint16_t num_queues; 155 | uint16_t queue_size; 156 | }; 157 | 158 | struct inflight_split_desc { 159 | uint8_t inflight; 160 | uint8_t padding[5]; 161 | uint16_t next; 162 | uint64_t counter; 163 | }; 164 | 165 | struct inflight_split_region { 166 | uint64_t features; 167 | uint16_t version; 168 | uint16_t desc_num; 169 | uint16_t last_batch_head; 170 | uint16_t used_idx; 171 | struct inflight_split_desc desc[]; 172 | }; 173 | 174 | struct vhost_user_log { 175 | uint64_t size; 176 | uint64_t offset; 177 | }; 178 | 179 | struct vhost_user_msg_hdr { 180 | uint32_t req; 181 | uint32_t flags; 182 | uint32_t size; 183 | }; 184 | 185 | union vhost_user_msg_payload { 186 | /* 187 | * VHOST_USER_GET_QUEUE_NUM, VHOST_USER_GET_PROTOCOL_FEATURES, 188 | * VHOST_USER_GET_FEATURES, 189 | * VHOST_USER_SET_VRING_KICK, VHOST_USER_SET_VRING_CALL 190 | */ 191 | uint64_t u64; 192 | /* VHOST_USER_GET_CONFIG, VHOST_USER_SET_CONFIG */ 193 | struct vhost_user_config_space config; 194 | /* VHOST_USER_SET_MEM_TABLE */ 195 | struct vhost_user_mem_desc mem_desc; 196 | 
/* 197 | * VHOST_USER_GET_VRING_BASE, VHOST_USER_SET_VRING_BASE, 198 | * VHOST_USER_SET_VRING_NUM 199 | */ 200 | struct vhost_user_vring_state vring_state; 201 | /* VHOST_USER_SET_VRING_ADDR */ 202 | struct vhost_user_vring_addr vring_addr; 203 | /* VHOST_USER_GET_INFLIGHT_FD, VHOST_USER_SET_INFLIGHT_FD */ 204 | struct vhost_user_inflight_desc inflight_desc; 205 | /* VHOST_USER_SET_LOG_BASE */ 206 | struct vhost_user_log log; 207 | }; 208 | 209 | #ifdef __cplusplus 210 | } 211 | #endif 212 | -------------------------------------------------------------------------------- /tests/test_libvhost.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import os 3 | import shutil 4 | import signal 5 | import time 6 | import pytest 7 | from typing import Tuple, List, Generator 8 | 9 | 10 | # 1 GiB should be enough 11 | DISK_IMAGE_SIZE = 1024 * 1024 * 1024 12 | WORK_DIR = "work" 13 | TEST_SERVER_BINARY_ENV_PATH = "TEST_SERVER_BINARY" 14 | BLKIO_BENCH_ENV_PATH = "BLKIO_BENCH_BINARY" 15 | 16 | 17 | def base_dir_abs_path() -> str: 18 | return os.path.dirname(os.path.abspath(__file__)) 19 | 20 | 21 | def build_dir() -> str: 22 | return os.path.join(base_dir_abs_path(), os.pardir, "build") 23 | 24 | 25 | @pytest.fixture(scope="session") 26 | def blkio_bench() -> str: 27 | env_path = os.environ.get(BLKIO_BENCH_ENV_PATH) 28 | if env_path and os.path.exists(env_path): 29 | return env_path 30 | 31 | blkio_bench_path = os.path.join( 32 | build_dir(), "subprojects", "libblkio", "examples", "blkio-bench" 33 | ) 34 | if os.path.exists(blkio_bench_path): 35 | return blkio_bench_path 36 | 37 | raise RuntimeError("This test requires blkio-bench example program " 38 | "which comes with libblkio") 39 | 40 | 41 | @pytest.fixture(scope="session") 42 | def vhost_user_test_server() -> str: 43 | env_path = os.environ.get(TEST_SERVER_BINARY_ENV_PATH) 44 | if env_path and os.path.exists(env_path): 45 | return env_path 46 | 47 | server_path = os.path.join( 48 | build_dir(), "tests", "vhost-user-blk-test-server" 49 | ) 50 | if os.path.exists(server_path): 51 | return server_path 52 | 53 | raise RuntimeError("A valid path to the test server must be specified " 54 | f"in the {TEST_SERVER_BINARY_ENV_PATH} variable") 55 | 56 | 57 | @pytest.fixture(scope="session") 58 | def work_dir() -> Generator[str, None, None]: 59 | work_dir_path = os.path.join(base_dir_abs_path(), WORK_DIR) 60 | 61 | os.makedirs(work_dir_path, exist_ok=True) 62 | yield work_dir_path 63 | shutil.rmtree(work_dir_path) 64 | 65 | 66 | @pytest.fixture(scope="session") 67 | def disk_image(work_dir: str) -> Generator[str, None, None]: 68 | disk_image_path = os.path.join(work_dir, "disk-image.raw") 69 | 70 | with open(disk_image_path, "wb+") as f: 71 | f.seek(DISK_IMAGE_SIZE - 1) 72 | f.write(bytearray(1)) 73 | 74 | yield disk_image_path 75 | os.remove(disk_image_path) 76 | 77 | 78 | def create_server( 79 | work_dir: str, disk_image: str, vhost_user_test_server: str, 80 | pte_flush_threshold: int = 0, sector_size: int = 4096, 81 | block_size: int = 4096 82 | ) -> Generator[str, None, None]: 83 | socket_path = os.path.join(work_dir, "server.sock") 84 | 85 | process = subprocess.Popen([ 86 | vhost_user_test_server, "--disk", 87 | f"socket-path={socket_path},blk-file={disk_image}" 88 | f",serial=helloworld,pte-flush-threshold={pte_flush_threshold}" 89 | f",sector-size={sector_size},block-size={block_size}" 90 | ]) 91 | 92 | retry = 0 93 | retry_limit = 5 94 | 95 | while True: 96 | if os.path.exists(socket_path): 
97 | break 98 | 99 | if retry < retry_limit: 100 | retry += 1 101 | time.sleep(10) 102 | else: 103 | raise RuntimeError("Failed to start test server!") 104 | 105 | yield socket_path 106 | 107 | process.send_signal(signal.SIGINT) 108 | process.wait(10) 109 | 110 | 111 | @pytest.fixture(scope="class") 112 | def server_socket( 113 | work_dir: str, disk_image: str, vhost_user_test_server: str 114 | ) -> Generator[str, None, None]: 115 | yield from create_server(work_dir, disk_image, vhost_user_test_server) 116 | 117 | 118 | @pytest.fixture(scope="class") 119 | def server_socket_with_pte_flush( 120 | request: pytest.FixtureRequest, work_dir: str, disk_image: str, 121 | vhost_user_test_server: str 122 | ) -> Generator[str, None, None]: 123 | yield from create_server(work_dir, disk_image, vhost_user_test_server, 124 | request.param) 125 | 126 | 127 | @pytest.fixture(scope="class") 128 | def server_socket_with_custom_sector_size( 129 | request: pytest.FixtureRequest, work_dir: str, disk_image: str, 130 | vhost_user_test_server: str 131 | ) -> Generator[str, None, None]: 132 | yield from create_server(work_dir, disk_image, vhost_user_test_server, 133 | 0, *request.param) 134 | 135 | 136 | def pretty_print_blkio_config(param: List[str]) -> str: 137 | return f"{param[0]}, blocksize={param[1]}" 138 | 139 | 140 | def check_run_blkio_bench( 141 | path: str, type: str, blocksize: int, time: int, socket: str, 142 | threads: int = 1 143 | ) -> None: 144 | subprocess.check_call([ 145 | path, f"--blocksize={blocksize}", f"--runtime={time}", 146 | f"--readwrite={type}", f"--num-threads={threads}", 147 | "virtio-blk-vhost-user", f"path={socket}" 148 | ], timeout=time + 10) 149 | 150 | 151 | class TestBasic: 152 | @pytest.mark.parametrize( 153 | 'config', 154 | [ 155 | ["read", 1024 * 1024], 156 | ["write", 1024 * 1024], 157 | ["randread", 4096], 158 | ["randwrite", 4096], 159 | ], 160 | ids=pretty_print_blkio_config 161 | ) 162 | def test_basic_operations( 163 | self, server_socket: str, blkio_bench: str, config: Tuple[str, int] 164 | ) -> None: 165 | check_run_blkio_bench(blkio_bench, *config, 30, server_socket) 166 | 167 | 168 | @pytest.mark.parametrize( 169 | 'server_socket_with_pte_flush, time', 170 | [ 171 | # Flush every 1 byte processed for 5 seconds 172 | [1, 5], 173 | # Flush every 50MiB processed for 30 seconds 174 | [50 * 1024 * 1024, 30], 175 | ], 176 | indirect=['server_socket_with_pte_flush'] 177 | ) 178 | class TestPTEFlush: 179 | def test_pte_flush( 180 | self, server_socket_with_pte_flush: str, time: int, 181 | blkio_bench: str 182 | ) -> None: 183 | check_run_blkio_bench(blkio_bench, "randread", 4096, time, 184 | server_socket_with_pte_flush) 185 | 186 | 187 | @pytest.mark.parametrize( 188 | 'server_socket_with_custom_sector_size, block_size', 189 | [ 190 | # Test different sector sizes 191 | # Format is: [[sector_size, block_size], blkio_bench_block_size] 192 | [[1024, 2048], 2048], 193 | [[4096, 4096], 4096], 194 | [[2048, 8192], 8192], 195 | ], 196 | indirect=['server_socket_with_custom_sector_size'] 197 | ) 198 | class TestSectorSizes: 199 | def test_sector_sizes( 200 | self, server_socket_with_custom_sector_size: str, block_size: int, 201 | blkio_bench: str 202 | ) -> None: 203 | check_run_blkio_bench(blkio_bench, "randread", block_size, 1, 204 | server_socket_with_custom_sector_size) 205 | -------------------------------------------------------------------------------- /platform.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | 
#include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #ifdef __cplusplus 15 | extern "C" { 16 | #endif 17 | 18 | #ifdef __has_feature 19 | # define VHD_HAS_FEATURE(x) __has_feature(x) 20 | #else 21 | # define VHD_HAS_FEATURE(x) 0 22 | #endif 23 | 24 | #define HUGE_PAGE_SIZE 0x40000000 // 1G, works also for 2M pages alignment 25 | 26 | /*////////////////////////////////////////////////////////////////////////////*/ 27 | 28 | #if !defined(NDEBUG) 29 | # define VHD_DEBUG 30 | #endif 31 | 32 | /*////////////////////////////////////////////////////////////////////////////*/ 33 | 34 | #if !defined(containerof) 35 | # define containerof(ptr, type, member) \ 36 | ((type *) ((char *)(ptr) - offsetof(type, member))) 37 | #endif 38 | 39 | #if !defined(countof) 40 | # define countof(a) (sizeof(a) / sizeof(*a)) 41 | #endif 42 | 43 | #ifndef likely 44 | #define likely(x) __builtin_expect(!!(x), 1) 45 | #define unlikely(x) __builtin_expect(!!(x), 0) 46 | #endif 47 | 48 | #ifdef __cplusplus 49 | # define VHD_STATIC_ASSERT(pred) static_assert((pred), __STRINGIFY(pred)) 50 | #elif (__STDC_VERSION__ >= 201112L) 51 | # define VHD_STATIC_ASSERT(pred) _Static_assert((pred), __STRINGIFY(pred)) 52 | #else 53 | # error Implement me 54 | #endif 55 | 56 | /* TODO: compiler-specifics for non-gcc? */ 57 | #ifdef __GNUC__ 58 | # define __STRINGIFY(x) #x 59 | # define VHD_NORETURN __attribute__((noreturn)) 60 | # define VHD_TYPEOF __typeof 61 | # define VHD_PACKED __attribute__((packed)) 62 | 63 | #define VHD_ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) 64 | 65 | /* Return 0-based index of first least significant bit set in 32-bit value */ 66 | static inline int vhd_find_first_bit32(uint32_t val) 67 | { 68 | VHD_STATIC_ASSERT(sizeof(val) == sizeof(int)); 69 | return __builtin_ctz(val); 70 | } 71 | 72 | /* Return 0-based index of first least significant bit set in 64-bit value */ 73 | static inline int vhd_find_first_bit64(uint64_t val) 74 | { 75 | VHD_STATIC_ASSERT(sizeof(val) == sizeof(long long)); 76 | return __builtin_ctzll(val); 77 | } 78 | 79 | #else 80 | # error Implement me 81 | #endif 82 | 83 | /* 84 | * MIN/MAX implementations with intuitive behavior: 85 | * - type safety 86 | * - exactly-once evaluation of both arguments 87 | * Note: unsuitable in constant expressions 88 | */ 89 | #define __safe_cmp(a, op, b) \ 90 | ({ \ 91 | typeof(1 ? (a) : (b)) _a = (a), _b = (b); \ 92 | _a op _b ? 
_a : _b; \ 93 | }) 94 | 95 | #undef MIN 96 | #define MIN(a, b) __safe_cmp(a, <, b) 97 | #undef MAX 98 | #define MAX(a, b) __safe_cmp(a, >, b) 99 | 100 | /*////////////////////////////////////////////////////////////////////////////*/ 101 | 102 | static inline void VHD_NORETURN _vhd_verify_helper( 103 | const char *what, 104 | const char *file, 105 | unsigned long line) 106 | { 107 | /* TODO: smarter logging */ 108 | fprintf(stderr, "Verify failed: \"%s\" at %s:%lu\n", what, file, line); 109 | abort(); 110 | } 111 | 112 | #define VHD_ASSERT(cond) assert(cond) 113 | #define VHD_UNREACHABLE() __builtin_unreachable() 114 | 115 | /* Verify is not compiled out in release builds */ 116 | #define VHD_VERIFY(cond) \ 117 | do { \ 118 | if (!(cond)) { \ 119 | _vhd_verify_helper(#cond, __FILE__, __LINE__); \ 120 | } \ 121 | } while (0) 122 | 123 | /*////////////////////////////////////////////////////////////////////////////*/ 124 | 125 | #ifdef VHD_MEMCHECK 126 | # include 127 | # define VHD_MEMCHECK_DEFINED(addr, len) \ 128 | VALGRIND_MAKE_MEM_DEFINED(addr, len) 129 | # define VHD_MEMCHECK_UNDEFINED(addr, len) \ 130 | VALGRIND_MAKE_MEM_UNDEFINED(addr, len) 131 | #else 132 | # define VHD_MEMCHECK_DEFINED(addr, len) 133 | # define VHD_MEMCHECK_UNDEFINED(addr, len) 134 | #endif 135 | 136 | /*////////////////////////////////////////////////////////////////////////////*/ 137 | 138 | #define VHD_ALIGN_UP(x, a) ({ \ 139 | VHD_TYPEOF(x) __mask = (VHD_TYPEOF(x))(a) - 1; \ 140 | ((x) + __mask) & ~__mask; \ 141 | }) 142 | #define VHD_ALIGN_DOWN(x, a) ((x) & ~((VHD_TYPEOF(x))(a) - 1)) 143 | #define VHD_IS_ALIGNED(x, a) (!((x) & ((VHD_TYPEOF(x))(a) - 1))) 144 | #define VHD_ALIGN_PTR_UP(x, a) (VHD_TYPEOF(x))VHD_ALIGN_UP((uintptr_t)x, a) 145 | 146 | static inline void *vhd_alloc(size_t bytes) 147 | { 148 | /* malloc actually accepts 0 sizes, but this is still most likely a bug.. */ 149 | VHD_ASSERT(bytes != 0); 150 | 151 | void *p = malloc(bytes); 152 | VHD_VERIFY(p != NULL); 153 | return p; 154 | } 155 | 156 | static inline void *vhd_zalloc(size_t bytes) 157 | { 158 | /* calloc actually accepts 0 sizes, but this is still most likely a bug.. */ 159 | VHD_ASSERT(bytes != 0); 160 | 161 | void *p = calloc(bytes, 1); 162 | VHD_VERIFY(p != NULL); 163 | return p; 164 | } 165 | 166 | static inline void *vhd_calloc(size_t nmemb, size_t size) 167 | { 168 | VHD_ASSERT(nmemb != 0 && size != 0); 169 | 170 | void *p = calloc(nmemb, size); 171 | VHD_VERIFY(p != NULL); 172 | return p; 173 | } 174 | 175 | /* TODO: aligned alloc */ 176 | 177 | static inline void vhd_free(void *p) 178 | { 179 | free(p); 180 | } 181 | 182 | static inline char *vhd_strdup(const char *s) __attribute__((malloc)); 183 | static inline char *vhd_strdup(const char *s) 184 | { 185 | size_t len; 186 | char *t; 187 | 188 | if (!s) { 189 | return NULL; 190 | } 191 | 192 | len = strlen(s) + 1; 193 | t = (char *)vhd_alloc(len); 194 | memcpy(t, s, len); 195 | return t; 196 | } 197 | 198 | static inline char *vhd_strdup_printf(const char *fmt, ...) 199 | __attribute__((format(printf, 1, 2), malloc)); 200 | static inline char *vhd_strdup_printf(const char *fmt, ...) 
201 | { 202 | int len; 203 | size_t size; 204 | char *ret; 205 | va_list args; 206 | 207 | va_start(args, fmt); 208 | len = vsnprintf(NULL, 0, fmt, args); 209 | va_end(args); 210 | 211 | if (len < 0) { 212 | return NULL; 213 | } 214 | 215 | size = (size_t)len + 1; 216 | ret = (char *)vhd_alloc(size); 217 | 218 | va_start(args, fmt); 219 | len = vsnprintf(ret, size, fmt, args); 220 | va_end(args); 221 | 222 | if (len < 0) { 223 | vhd_free(ret); 224 | return NULL; 225 | } 226 | return ret; 227 | } 228 | 229 | int init_platform_page_size(void); 230 | 231 | extern size_t platform_page_size; 232 | 233 | #ifdef __cplusplus 234 | } 235 | #endif 236 | -------------------------------------------------------------------------------- /vdev.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "event.h" 6 | #include "queue.h" 7 | 8 | #include "virtio/virt_queue.h" 9 | 10 | #ifdef __cplusplus 11 | extern "C" { 12 | #endif 13 | 14 | struct vhd_vdev; 15 | struct vhd_vring; 16 | struct vhd_request_queue; 17 | 18 | /** 19 | * Vhost device type description. 20 | */ 21 | struct vhd_vdev_type { 22 | /* Human-readable description */ 23 | const char *desc; 24 | 25 | /* Polymorphic type ops */ 26 | uint64_t (*get_features)(struct vhd_vdev *vdev); 27 | int (*set_features)(struct vhd_vdev *vdev, uint64_t features); 28 | size_t (*get_config)(struct vhd_vdev *vdev, void *cfgbuf, 29 | size_t bufsize, size_t offset); 30 | int (*dispatch_requests)(struct vhd_vdev *vdev, struct vhd_vring *vring); 31 | void (*free)(struct vhd_vdev *vdev); 32 | }; 33 | 34 | struct vhd_memory_map; 35 | struct vhd_memory_log; 36 | struct vhd_work; 37 | 38 | /** 39 | * Vhost generic device instance. 40 | * 41 | * Devices are polymorphic through their respective types. 42 | */ 43 | struct vhd_vdev { 44 | char *log_tag; 45 | 46 | /* Accosiated client private data */ 47 | void *priv; 48 | 49 | /* Device type description */ 50 | const struct vhd_vdev_type *type; 51 | 52 | /* Server socket fd when device is a vhost-user server */ 53 | int listenfd; 54 | struct vhd_io_handler *listen_handler; 55 | 56 | /* Connected device fd. Single active connection per device. */ 57 | int connfd; 58 | struct vhd_io_handler *conn_handler; 59 | 60 | /* Message currently being handled */ 61 | uint32_t req; 62 | 63 | /* Timing for message handling */ 64 | struct timespec msg_handling_started; 65 | int timerfd; 66 | struct vhd_io_handler *timer_handler; 67 | 68 | /* Attached request queues */ 69 | struct vhd_request_queue **rqs; 70 | int num_rqs; 71 | 72 | /* 73 | * Vhost protocol features which can be supported for this vdev and 74 | * those which have been actually enabled during negotiation. 
75 | */ 76 | uint64_t supported_protocol_features; 77 | uint64_t negotiated_protocol_features; 78 | uint64_t supported_features; 79 | uint64_t negotiated_features; 80 | 81 | /* Maximum amount of request queues this device can support */ 82 | uint16_t num_queues; 83 | struct vhd_vring *vrings; /* Total num_queues elements */ 84 | 85 | /* Gets called after mapping guest memory region */ 86 | int (*map_cb)(void *addr, size_t len); 87 | 88 | /* Gets called before unmapping guest memory region */ 89 | int (*unmap_cb)(void *addr, size_t len); 90 | 91 | struct vhd_memory_map *memmap; 92 | struct vhd_memory_map *old_memmap; 93 | struct vhd_memory_log *memlog; 94 | struct vhd_memory_log *old_memlog; 95 | 96 | /** 97 | * Shared memory to store information about inflight requests and restore 98 | * virtqueue state after reconnect. 99 | */ 100 | struct inflight_split_region *inflight_mem; 101 | uint64_t inflight_size; 102 | 103 | size_t pte_flush_byte_threshold; 104 | int64_t bytes_left_before_pte_flush; 105 | 106 | /* #vrings which may have requests in flight */ 107 | uint16_t num_vrings_in_flight; 108 | /* #vrings started and haven't yet acknowledged stop */ 109 | uint16_t num_vrings_started; 110 | 111 | /* callback and arg to be called when the device is released */ 112 | void (*release_cb)(void *); 113 | void *release_arg; 114 | 115 | /** Global vdev list */ 116 | LIST_ENTRY(vhd_vdev) vdev_list; 117 | 118 | /* #vrings performing an action in response to a control message */ 119 | uint16_t num_vrings_handling_msg; 120 | /* function to call once the current message is handled in all vrings */ 121 | int (*handle_complete)(struct vhd_vdev *vdev); 122 | 123 | /* whether an ACK should be sent once the message is handled */ 124 | bool ack_pending; 125 | bool pte_flush_pending; 126 | 127 | /* fd to keep open until handle_complete and to close there */ 128 | int keep_fd; 129 | 130 | struct vhd_work *work; 131 | }; 132 | 133 | /** 134 | * Init new generic vhost device in server mode 135 | * @socket_path Listen socket path 136 | * @type Device type description 137 | * @vdev vdev instance to initialize 138 | * @max_queues Maximum number of queues this device can support 139 | * @rqs Associated request queues 140 | * @num_rqs Number of request queues 141 | * @priv User private data 142 | * @map_cb User function to call after mapping guest memory 143 | * @unmap_cb User function to call before unmapping guest memory 144 | * @pte_flush_byte_threshold 145 | * Number of bytes to process before flushing the PTEs 146 | * of the guest address space 147 | */ 148 | int vhd_vdev_init_server( 149 | struct vhd_vdev *vdev, 150 | const char *socket_path, 151 | const struct vhd_vdev_type *type, 152 | int max_queues, 153 | struct vhd_request_queue **rqs, int num_rqs, 154 | void *priv, 155 | int (*map_cb)(void *addr, size_t len), 156 | int (*unmap_cb)(void *addr, size_t len), 157 | size_t pte_flush_byte_threshold); 158 | 159 | /** 160 | * Stop vhost device. Once this returns no more new requests will reach the 161 | * backend. @release_cb(@release_arg) will be called once all requests are 162 | * completed and the associated resources released. 
163 | */ 164 | int vhd_vdev_stop_server(struct vhd_vdev *vdev, 165 | void (*release_cb)(void *), void *release_arg); 166 | 167 | /** 168 | * Device vring instance 169 | */ 170 | struct vhd_vring { 171 | struct vhd_vdev *vdev; 172 | char *log_tag; 173 | 174 | int kickfd; 175 | int callfd; 176 | int errfd; 177 | 178 | /* started as seen from control plane */ 179 | bool started_in_ctl; 180 | /* requested to disconnect */ 181 | bool disconnecting; 182 | 183 | /* Client kick event */ 184 | struct vhd_io_handler *kick_handler; 185 | 186 | /* called in control plane once vring is drained */ 187 | int (*on_drain_cb)(struct vhd_vring *); 188 | 189 | /* 190 | * vq attributes that may change while vring is started; these are updated 191 | * in the control event loop and propagated via BH into vq 192 | */ 193 | struct { 194 | uint64_t desc_addr; 195 | uint64_t used_addr; 196 | uint64_t avail_addr; 197 | uint32_t flags; 198 | uint64_t used_gpa_base; 199 | void *desc; 200 | void *used; 201 | void *avail; 202 | struct vhd_memory_map *mm; 203 | struct vhd_memory_log *log; 204 | bool enabled; 205 | } shadow_vq; 206 | 207 | /* 208 | * the fields below are only accessed in dataplane unless the vring is 209 | * known to be stopped 210 | */ 211 | struct virtio_virtq vq; 212 | /* started as seen from dataplane */ 213 | bool started_in_rq; 214 | /* #requests pending completion */ 215 | uint16_t num_in_flight; 216 | /* #requests pending completion when the queue is requested to stop */ 217 | uint16_t num_in_flight_at_stop; 218 | }; 219 | 220 | #define VHD_VRING_FROM_VQ(ptr) containerof(ptr, struct vhd_vring, vq) 221 | 222 | struct vhd_request_queue *vhd_get_rq_for_vring(struct vhd_vring *vring); 223 | 224 | void vhd_vring_inc_in_flight(struct vhd_vring *vring); 225 | void vhd_vring_dec_in_flight(struct vhd_vring *vring); 226 | 227 | #ifdef __cplusplus 228 | } 229 | #endif 230 | -------------------------------------------------------------------------------- /docs/logo.svg: -------------------------------------------------------------------------------- (SVG image data omitted) -------------------------------------------------------------------------------- /docs/architecture.md: -------------------------------------------------------------------------------- 1 | # libvhost-server architecture 2 | 3 | `libvhost-server` (henceforth `libvhost`) is a component that helps implement 4 | vhost-user device servers. Its main purpose is to insulate the server from the 5 | vhost-user control protocol, memory mapping, address translation, virtio queue 6 | processing, and so on. 7 | 8 | It's designed to transfer requests and responses between the guest drivers and 9 | the backend efficiently, with as little added latency and mutual interference 10 | between devices as possible. 11 | 12 | This component is implemented as a library. An application implementing a 13 | vhost-user device (slave in vhost-user speak) links to this library and calls 14 | into the API it exposes (so it's called _user_ hereinafter); this allows it to 15 | accept connections from a _client_ (vhost-user master, e.g. qemu) and process 16 | its virtio queues.
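
To make the split between the library and the user concrete, here is a minimal setup sketch for a vhost-user-blk backend. It is illustrative only: the `vhost/server.h` include location, the `NULL` log callback and the backend hand-off are assumptions, while the request-queue and blockdev calls mirror the declarations in `server.c` and `include/vhost/blockdev.h`.

```c
#include <errno.h>
#include <pthread.h>
#include <stddef.h>

#include "vhost/blockdev.h"   /* vhd_register_blockdev(), struct vhd_bdev_info */
#include "vhost/server.h"     /* assumed home of the server/request-queue API */

/* Dataplane thread: one request queue event loop, run explicitly by the user. */
static void *rq_thread_func(void *arg)
{
    struct vhd_request_queue *rq = arg;

    /* vhd_run_queue() returns -EAGAIN while the loop should keep going. */
    while (vhd_run_queue(rq) == -EAGAIN) {
        struct vhd_request req;

        while (vhd_dequeue_request(rq, &req)) {
            struct vhd_bdev_io *bio = vhd_get_bdev_io(req.io);
            /*
             * Hand bio->first_sector / bio->total_sectors / bio->sglist to the
             * real storage backend here (the backend itself is out of scope);
             * completion is reported back through the library's bio machinery.
             */
            (void)bio;
        }
    }
    return NULL;
}

int main(void)
{
    struct vhd_request_queue *rq;
    pthread_t dataplane;

    /* Control plane: spawns the library-owned control event loop thread. */
    if (vhd_start_vhost_server(NULL /* log callback; NULL assumed OK */) < 0) {
        return 1;
    }

    rq = vhd_create_request_queue();
    if (!rq) {
        return 1;
    }
    pthread_create(&dataplane, NULL, rq_thread_func, rq);

    struct vhd_bdev_info bdev = {
        .serial       = "sample-serial",
        .socket_path  = "/tmp/vhost-blk.sock",   /* hypothetical path */
        .block_size   = 4096,
        .total_blocks = 1024 * 1024,             /* 4 GiB with 4 KiB blocks */
        .num_queues   = 1,
    };

    /* All virtio queues of this device are served by the single request queue. */
    struct vhd_vdev *vdev = vhd_register_blockdev(&bdev, &rq, 1, NULL);
    if (!vdev) {
        return 1;
    }

    /* ... serve requests until shutdown is requested; the teardown sequence is
     * sketched near the end of this document ... */
    return 0;
}
```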
17 | 18 | ## Lockless event loops and bottom halves 19 | 20 | For efficiency in a highly concurrent environment, the library implements 21 | lockless event loops: it tends to avoid using sleeping synchronization 22 | primitives like mutexes. 23 | 24 | To coordinate different contexts that execute event loops, _bottom halves_ are 25 | used: functions that are scheduled to run (soon) on the target event loop. 26 | 27 | ## Contexts of execution 28 | 29 | The library assumes there are different kinds of execution contexts: 30 | 31 | 1. dataplane aka request queue event loop 32 | 33 | This is where virtio queues are processed. There may be multiple request 34 | queue event loops. Typically every request queue event loop is run in its 35 | own thread. Every virtio queue is associated permanently with a request 36 | queue. _Currently all virtio queues of a device are associated with a 37 | single request queue, but this limitation will be lifted._ 38 | 39 | The request queue event loop is supposed to be run explicitly by the user. 40 | On each iteration, the event loop blocks until a host notification is 41 | signaled on any of its virtio queues. Once it's woken up, it extracts all 42 | available virtio elements from all signaled virtio queues and forms device 43 | requests out of them. It may then process some simple ones synchronously; 44 | otherwise it enqueues them in a double-ended queue, common for all 45 | associated virtio queues (this helps avoid starvation). 46 | 47 | The user dequeues the requests from this request queue and submits them for 48 | asynchronous processing in another context outside of `libvhost` scope. 49 | 50 | Once the request is fully processed, the user submits a completion function 51 | (via a bottom half) back onto the request queue event loop; this leads to 52 | releasing the resources associated with the request and publishing the 53 | result to the client. 54 | 55 | 2. control event loop 56 | 57 | This is a **single** library-global event loop handling state transitions 58 | and vhost-user socket communications for all devices served by the library. 59 | The thread running the control event loop is owned by the library: it's 60 | created at library initialization and stopped on deinitialization. 61 | 62 | 3. external contexts 63 | 64 | These are external contexts that initialize and deinitialize devices to be 65 | handled by the library. These operations are **not** lockless: the 66 | respective functions block until the control loop acknowledges the requested 67 | state transition of the device. 68 | 69 | ## Device state machine 70 | 71 | Device (`struct vhd_vdev`) state transitions happen in response to client 72 | actions -- connect, disconnect, vhost-user control messages, and user actions 73 | -- device start and stop. They all happen in the control event loop. 74 | 75 | However, some of the device state transitions require associated state 76 | transitions in the device virtio queues (`struct vhd_vring`).
In order to 77 | maintain the lockless nature of the library, such device state transitions 78 | happen in several stages: 79 | 80 | - first the transition is started in the control event loop where input 81 | parameters are verified and internal state is prepared, to be later exposed 82 | in the dataplane 83 | 84 | - then corresponding state transitions of the virtio queues in the respective 85 | request queue event loop(s) are scheduled via bottom halves 86 | 87 | - those, in turn, put the internal state prepared earlier into effect in the 88 | dataplane and signal completion of the transition via bottom halves back to 89 | the control event loop 90 | 91 | - then the device state transition finishes and the reply is sent to the 92 | client, if needed. 93 | 94 | The messages from a single client are never handled concurrently: upon 95 | reception of a message the state machine suspends reception of further messages 96 | on the socket until the current one is fully handled and the reply is sent. 97 | The only action which may intervene is the device disconnection, either due to 98 | the client shutting down its end or due to the external context issuing the 99 | device stop. 100 | 101 | ### User-initiated device stop or client disconnection 102 | 103 | Certain complexity arises from the fact that when the device disconnection 104 | happens, some requests on the device may have been dequeued from the request 105 | queue and submitted to the backend for asynchronous handling. 106 | 107 | Therefore, disconnection is multi-stage: 108 | 109 | - if the disconnection is initiated by the user doing a device stop call in an 110 | external context, it passes the request to the device in the control event 111 | loop and blocks on a semaphore 112 | 113 | - the socket connected to the client is closed 114 | 115 | - all virtio queues are requested to stop via bottom halves in the request 116 | queue event loop(s) 117 | 118 | - in the request queue event loop 119 | 120 | * the virtio queue is stopped, i.e.
no more requests are fetched from the 121 | virtio queue 122 | 123 | * the requests that have already been fetched from the virtio queue but still 124 | remain in the request queue are canceled: completed immediately with a 125 | special status such that the completion is not exposed to the client, 126 | effectively dropping the request, in the expectation that it will be 127 | resubmitted by the vhost-user in_flight mechanism; this ensures that no 128 | more requests from this virtio queue will enter the backend 129 | 130 | * the virtio queue acknowledges the stop to the device in the control event 131 | loop context via a bottom half 132 | 133 | - if the disconnection was user-initiated, once the device sees all its virtio 134 | queues acknowledge the stop, it releases the semaphore so that the device 135 | stop call unblocks and returns in its external context; from this point on 136 | the backend is guaranteed that no more requests will be submitted 137 | 138 | - once all device requests that were caught in the backend by the disconnection 139 | are completed, leaving no more requests in the whole pipeline and thus 140 | ensuring that nothing will touch the guest memory any more, the device 141 | proceeds with the cleanup: 142 | 143 | * if the disconnection was user-initiated, the device shuts itself down, 144 | closes the listening socket, releases remaining resources and executes a 145 | previously set up callback to inform the external context that the device 146 | is fully terminated and freed 147 | 148 | * otherwise the device resets its state and resumes listening for incoming 149 | connections. 150 | (An API-level sketch of the user-initiated teardown is given at the end of this document.) 151 | ### Live migration support 152 | 153 | Live migration support basically adheres to the virtio spec, with a notable 154 | extension, which appears underspecified there: 155 | 156 | Before the `VHOST_USER_GET_VRING_BASE` message is replied to, all requests in 157 | the virtio queue are drained and completed to the client, leaving no in-flight 158 | requests and thus making it safe to resume operation upon migration. 159 | 160 | ### Reconnection support 161 | 162 | The library supports starting in a mode where the client survived the server's 163 | premature termination and wants to re-establish the connection and resume 164 | operation. 165 | 166 | The library basically adheres to the spec in this regard, with a few points of 167 | note: 168 | 169 | - the in-flight region is initially created in a memfd (there's no requirement 170 | that it's in a memfd when the connection is re-established) 171 | 172 | - when the device is stopped by the user while there still is an open 173 | connection with the client, the requests that happen to be in flight are 174 | canceled, ensuring that the backend internals don't touch the client's memory 175 | after the stop call is acknowledged.
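
In terms of the public API, the user-initiated stop described above could look roughly like the sketch below. The semaphore-based wait and the helper itself are illustrative assumptions; the `vhd_unregister_blockdev()`, `vhd_stop_queue()`, `vhd_release_request_queue()` and `vhd_stop_vhost_server()` calls follow their declarations in the headers and `server.c`.

```c
#include <pthread.h>
#include <semaphore.h>

#include "vhost/blockdev.h"   /* same assumed headers as in the setup sketch */
#include "vhost/server.h"

static sem_t g_unregister_done;

/* Runs once all of the device's requests are completed and it is freed. */
static void on_unregistered(void *arg)
{
    (void)arg;
    sem_post(&g_unregister_done);
}

/*
 * Illustrative teardown helper: vdev, rq and dataplane are the objects created
 * in the setup sketch earlier in this document.
 */
static void shutdown_device(struct vhd_vdev *vdev, struct vhd_request_queue *rq,
                            pthread_t dataplane)
{
    sem_init(&g_unregister_done, 0, 0);

    /* 1. Ask the device to stop and wait for the release callback. */
    vhd_unregister_blockdev(vdev, on_unregistered, NULL);
    sem_wait(&g_unregister_done);

    /* 2. Terminate the (now idle) request queue event loop and reclaim it. */
    vhd_stop_queue(rq);
    pthread_join(dataplane, NULL);
    vhd_release_request_queue(rq);

    /* 3. Finally stop the library-global control event loop. */
    vhd_stop_vhost_server();
}
```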
176 | -------------------------------------------------------------------------------- /server.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "platform.h" 4 | #include "server_internal.h" 5 | #include "queue.h" 6 | #include "bio.h" 7 | #include "logging.h" 8 | #include "vdev.h" 9 | 10 | #define VHOST_EVENT_LOOP_EVENTS 128 11 | 12 | static struct vhd_event_loop *g_vhost_evloop; 13 | static pthread_t g_vhost_thread; 14 | 15 | static inline void free_vhost_event_loop(void) 16 | { 17 | vhd_free_event_loop(g_vhost_evloop); 18 | g_vhost_evloop = NULL; 19 | } 20 | 21 | static __thread bool g_is_ctl_thread; 22 | 23 | bool vhd_in_ctl_thread(void) 24 | { 25 | return g_is_ctl_thread; 26 | } 27 | 28 | static void *vhost_evloop_func(void *arg) 29 | { 30 | int res; 31 | 32 | g_is_ctl_thread = true; 33 | 34 | do { 35 | res = vhd_run_event_loop(g_vhost_evloop, -1); 36 | } while (res == -EAGAIN); 37 | 38 | if (res < 0) { 39 | VHD_LOG_ERROR("vhost event loop iteration failed: %d", res); 40 | } 41 | 42 | return NULL; 43 | } 44 | 45 | int vhd_start_vhost_server(log_function log_fn) 46 | { 47 | int res; 48 | 49 | res = init_platform_page_size(); 50 | if (res != 0) { 51 | VHD_LOG_ERROR("failed to init platform page size: %d", res); 52 | return -res; 53 | } 54 | 55 | if (g_vhost_evloop != NULL) { 56 | return 0; 57 | } 58 | 59 | g_log_fn = log_fn; 60 | 61 | g_vhost_evloop = vhd_create_event_loop(VHOST_EVENT_LOOP_EVENTS); 62 | if (!g_vhost_evloop) { 63 | VHD_LOG_ERROR("failed to create vhost event loop"); 64 | return -EIO; 65 | } 66 | 67 | res = pthread_create(&g_vhost_thread, NULL, vhost_evloop_func, NULL); 68 | if (res != 0) { 69 | VHD_LOG_ERROR("failed to start vhost event loop thread: %d", res); 70 | free_vhost_event_loop(); 71 | return -res; 72 | } 73 | 74 | return 0; 75 | } 76 | 77 | void vhd_stop_vhost_server(void) 78 | { 79 | if (!g_vhost_evloop) { 80 | return; 81 | } 82 | 83 | vhd_terminate_event_loop(g_vhost_evloop); 84 | pthread_join(g_vhost_thread, NULL); 85 | free_vhost_event_loop(); 86 | } 87 | 88 | struct vhd_io_handler *vhd_add_vhost_io_handler(int fd, 89 | int (*read)(void *opaque), 90 | void *opaque) 91 | { 92 | return vhd_add_io_handler(g_vhost_evloop, fd, read, opaque); 93 | } 94 | 95 | void vhd_run_in_ctl(void (*cb)(void *), void *opaque) 96 | { 97 | vhd_bh_schedule_oneshot(g_vhost_evloop, cb, opaque); 98 | } 99 | 100 | int vhd_submit_ctl_work_and_wait(void (*func)(struct vhd_work *, void *), 101 | void *opaque) 102 | { 103 | return vhd_submit_work_and_wait(g_vhost_evloop, func, opaque); 104 | } 105 | 106 | /*////////////////////////////////////////////////////////////////////////////*/ 107 | 108 | /* 109 | * Request queues 110 | */ 111 | 112 | typedef SLIST_HEAD(, vhd_io) vhd_io_list; 113 | 114 | /* TODO: bounded queue */ 115 | struct vhd_request_queue { 116 | struct vhd_event_loop *evloop; 117 | 118 | TAILQ_HEAD(, vhd_io) submission; 119 | TAILQ_HEAD(, vhd_io) inflight; 120 | vhd_io_list completion; 121 | 122 | struct vhd_bh *completion_bh; 123 | struct vhd_rq_metrics metrics; 124 | }; 125 | 126 | void vhd_run_in_rq(struct vhd_request_queue *rq, void (*cb)(void *), 127 | void *opaque) 128 | { 129 | vhd_bh_schedule_oneshot(rq->evloop, cb, opaque); 130 | } 131 | 132 | static void req_complete(struct vhd_io *io) 133 | { 134 | /* completion_handler destroys bio. 
save vring for unref */ 135 | struct vhd_vring *vring = io->vring; 136 | io->completion_handler(io); 137 | vhd_vring_dec_in_flight(vring); 138 | } 139 | 140 | static void rq_complete_bh(void *opaque) 141 | { 142 | struct vhd_request_queue *rq = opaque; 143 | vhd_io_list io_list, io_list_reverse; 144 | 145 | SLIST_INIT(&io_list); 146 | SLIST_INIT(&io_list_reverse); 147 | /* steal completion list from rq, swap for a fresh one */ 148 | SLIST_MOVE_ATOMIC(&io_list_reverse, &rq->completion); 149 | 150 | /* the list was filled LIFO, we want the completions FIFO */ 151 | for (;;) { 152 | struct vhd_io *io = SLIST_FIRST(&io_list_reverse); 153 | if (!io) { 154 | break; 155 | } 156 | SLIST_REMOVE_HEAD(&io_list_reverse, completion_link); 157 | SLIST_INSERT_HEAD(&io_list, io, completion_link); 158 | } 159 | 160 | for (;;) { 161 | struct vhd_io *io = SLIST_FIRST(&io_list); 162 | if (!io) { 163 | break; 164 | } 165 | SLIST_REMOVE_HEAD(&io_list, completion_link); 166 | TAILQ_REMOVE(&rq->inflight, io, inflight_link); 167 | req_complete(io); 168 | ++rq->metrics.completed; 169 | } 170 | 171 | struct vhd_io *io = TAILQ_FIRST(&rq->inflight); 172 | rq->metrics.oldest_inflight_ts = io ? io->ts : 0; 173 | } 174 | 175 | struct vhd_request_queue *vhd_create_request_queue(void) 176 | { 177 | struct vhd_request_queue *rq = vhd_alloc(sizeof(*rq)); 178 | 179 | rq->evloop = vhd_create_event_loop(VHD_EVENT_LOOP_DEFAULT_MAX_EVENTS); 180 | if (!rq->evloop) { 181 | vhd_free(rq); 182 | return NULL; 183 | } 184 | 185 | TAILQ_INIT(&rq->submission); 186 | TAILQ_INIT(&rq->inflight); 187 | SLIST_INIT(&rq->completion); 188 | rq->completion_bh = vhd_bh_new(rq->evloop, rq_complete_bh, rq); 189 | memset(&rq->metrics, 0, sizeof(rq->metrics)); 190 | return rq; 191 | } 192 | 193 | void vhd_release_request_queue(struct vhd_request_queue *rq) 194 | { 195 | assert(TAILQ_EMPTY(&rq->submission)); 196 | assert(TAILQ_EMPTY(&rq->inflight)); 197 | assert(SLIST_EMPTY(&rq->completion)); 198 | vhd_bh_delete(rq->completion_bh); 199 | vhd_free_event_loop(rq->evloop); 200 | vhd_free(rq); 201 | } 202 | 203 | struct vhd_io_handler *vhd_add_rq_io_handler(struct vhd_request_queue *rq, 204 | int fd, int (*read)(void *opaque), 205 | void *opaque) 206 | { 207 | return vhd_add_io_handler(rq->evloop, fd, read, opaque); 208 | } 209 | 210 | int vhd_run_queue(struct vhd_request_queue *rq) 211 | { 212 | return vhd_run_event_loop(rq->evloop, -1); 213 | } 214 | 215 | void vhd_stop_queue(struct vhd_request_queue *rq) 216 | { 217 | vhd_terminate_event_loop(rq->evloop); 218 | } 219 | 220 | bool vhd_dequeue_request(struct vhd_request_queue *rq, 221 | struct vhd_request *out_req) 222 | { 223 | struct vhd_io *io = TAILQ_FIRST(&rq->submission); 224 | 225 | if (!io) { 226 | return false; 227 | } 228 | 229 | TAILQ_REMOVE(&rq->submission, io, submission_link); 230 | 231 | time_t now = time(NULL); 232 | io->ts = now; 233 | TAILQ_INSERT_TAIL(&rq->inflight, io, inflight_link); 234 | if (!rq->metrics.oldest_inflight_ts) { 235 | rq->metrics.oldest_inflight_ts = now; 236 | } 237 | 238 | out_req->vdev = io->vring->vdev; 239 | out_req->io = io; 240 | 241 | catomic_inc(&rq->metrics.dequeued); 242 | return true; 243 | } 244 | 245 | int vhd_enqueue_request(struct vhd_request_queue *rq, struct vhd_io *io) 246 | { 247 | vhd_vring_inc_in_flight(io->vring); 248 | 249 | TAILQ_INSERT_TAIL(&rq->submission, io, submission_link); 250 | catomic_inc(&rq->metrics.enqueued); 251 | return 0; 252 | } 253 | 254 | void vhd_cancel_queued_requests(struct vhd_request_queue *rq, 255 | const struct vhd_vring 
*vring) 256 | { 257 | struct vhd_io *io = TAILQ_FIRST(&rq->submission); 258 | 259 | while (io) { 260 | struct vhd_io *next = TAILQ_NEXT(io, submission_link); 261 | if (unlikely(io->vring == vring)) { 262 | TAILQ_REMOVE(&rq->submission, io, submission_link); 263 | io->status = VHD_BDEV_CANCELED; 264 | req_complete(io); 265 | catomic_inc(&rq->metrics.cancelled); 266 | } 267 | io = next; 268 | } 269 | } 270 | 271 | /* 272 | * can be called from arbitrary thread; will schedule completion on the rq 273 | * event loop 274 | */ 275 | void vhd_complete_bio(struct vhd_io *io, enum vhd_bdev_io_result status) 276 | { 277 | struct vhd_request_queue *rq; 278 | 279 | io->status = status; 280 | rq = vhd_get_rq_for_vring(io->vring); 281 | 282 | /* 283 | * if this is not the first completion on the list scheduling the bh can be 284 | * skipped because the first one must have done so 285 | */ 286 | if (!SLIST_INSERT_HEAD_ATOMIC(&rq->completion, io, completion_link)) { 287 | vhd_bh_schedule(rq->completion_bh); 288 | } 289 | catomic_inc(&rq->metrics.completions_received); 290 | } 291 | 292 | void vhd_get_rq_stat(struct vhd_request_queue *rq, 293 | struct vhd_rq_metrics *metrics) 294 | { 295 | *metrics = rq->metrics; 296 | } 297 | -------------------------------------------------------------------------------- /memmap.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "queue.h" 7 | #include "memmap.h" 8 | #include "platform.h" 9 | #include "logging.h" 10 | #include "objref.h" 11 | #include "server_internal.h" 12 | 13 | struct vhd_mmap_callbacks { 14 | /* gets called after mapping guest memory region */ 15 | int (*map_cb)(void *addr, size_t len); 16 | /* gets called before unmapping guest memory region */ 17 | int (*unmap_cb)(void *addr, size_t len); 18 | }; 19 | 20 | struct vhd_memory_region { 21 | struct objref ref; 22 | 23 | /* start of the region in guest physical space */ 24 | uint64_t gpa; 25 | /* start of the region in master's virtual space */ 26 | uint64_t uva; 27 | /* start of the region in this process' virtual space */ 28 | void *ptr; 29 | /* region size */ 30 | size_t size; 31 | /* offset of the region from the file base */ 32 | off_t offset; 33 | 34 | /* unique identifiers of this region for caching purposes */ 35 | dev_t device; 36 | ino_t inode; 37 | 38 | /* 39 | * file descriptor this region was created from. Note that this is only 40 | * set to a valid value if the region was created with preserve_fd, -1 41 | * otherwise. 
42 | */ 43 | int fd; 44 | 45 | /* callbacks associated with this memory region */ 46 | struct vhd_mmap_callbacks callbacks; 47 | 48 | LIST_ENTRY(vhd_memory_region) region_link; 49 | }; 50 | 51 | static LIST_HEAD(, vhd_memory_region) g_regions = 52 | LIST_HEAD_INITIALIZER(g_regions); 53 | 54 | size_t platform_page_size; 55 | 56 | static int region_init_id(struct vhd_memory_region *reg, int fd, bool preserve_fd) 57 | { 58 | struct stat stat; 59 | 60 | if (preserve_fd) { 61 | reg->fd = dup(fd); 62 | if (reg->fd < 0) { 63 | int err = errno; 64 | VHD_LOG_ERROR("unable to dup memory region %p-%p fd: %s", 65 | reg->ptr, reg->ptr + reg->size, strerror(err)); 66 | return -err; 67 | } 68 | } else { 69 | reg->fd = -1; 70 | } 71 | 72 | if (fstat(fd, &stat) < 0) { 73 | return 0; 74 | } 75 | 76 | reg->device = stat.st_dev; 77 | reg->inode = stat.st_ino; 78 | 79 | return 0; 80 | } 81 | 82 | /* 83 | * This should be no less than VHOST_USER_MEM_REGIONS_MAX, to accept any 84 | * allowed VHOST_USER_SET_MEM_TABLE message. The master may use more via 85 | * VHOST_USER_ADD_MEM_REG message if VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS 86 | * is negotiated. 87 | */ 88 | #define VHD_RAM_SLOTS_MAX 32 89 | 90 | size_t vhd_memmap_max_memslots(void) 91 | { 92 | return VHD_RAM_SLOTS_MAX; 93 | } 94 | 95 | struct vhd_memory_map { 96 | struct objref ref; 97 | 98 | struct vhd_mmap_callbacks callbacks; 99 | 100 | /* actual number of slots used */ 101 | unsigned num; 102 | struct vhd_memory_region *regions[VHD_RAM_SLOTS_MAX]; 103 | }; 104 | 105 | /* 106 | * Returns actual pointer where uva points to 107 | * or NULL in case of mapping absence 108 | */ 109 | void *uva_to_ptr(struct vhd_memory_map *mm, uint64_t uva) 110 | { 111 | unsigned i; 112 | 113 | for (i = 0; i < mm->num; i++) { 114 | struct vhd_memory_region *reg = mm->regions[i]; 115 | if (uva >= reg->uva && uva - reg->uva < reg->size) { 116 | return reg->ptr + (uva - reg->uva); 117 | } 118 | } 119 | 120 | return NULL; 121 | } 122 | 123 | static void *map_memory(size_t len, int fd, off_t offset) 124 | { 125 | size_t aligned_len, map_len; 126 | void *addr; 127 | 128 | /* 129 | * Some apps map memory in very small chunks, make sure it's at least the 130 | * size of a page so that remap doesn't fail later on. 
131 | */ 132 | len = VHD_ALIGN_UP(len, platform_page_size); 133 | 134 | aligned_len = VHD_ALIGN_PTR_UP(len, HUGE_PAGE_SIZE); 135 | map_len = aligned_len + HUGE_PAGE_SIZE + platform_page_size; 136 | 137 | char *map = mmap(NULL, map_len, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 138 | 0); 139 | if (map == MAP_FAILED) { 140 | VHD_LOG_ERROR("unable to map memory: %s", strerror(errno)); 141 | return MAP_FAILED; 142 | } 143 | 144 | char *aligned_addr = VHD_ALIGN_PTR_UP(map + platform_page_size, HUGE_PAGE_SIZE); 145 | addr = mmap(aligned_addr, len, PROT_READ | PROT_WRITE, 146 | MAP_SHARED | MAP_FIXED, fd, offset); 147 | if (addr == MAP_FAILED) { 148 | VHD_LOG_ERROR("unable to remap memory region %p-%p: %s", aligned_addr, 149 | aligned_addr + len, strerror(errno)); 150 | munmap(map, map_len); 151 | return MAP_FAILED; 152 | } 153 | aligned_addr = addr; 154 | 155 | size_t tail_len = aligned_len - len; 156 | if (tail_len) { 157 | char *tail = aligned_addr + len; 158 | addr = mmap(tail, tail_len, PROT_READ | PROT_WRITE, 159 | MAP_SHARED | MAP_ANONYMOUS | MAP_FIXED, -1, 0); 160 | if (addr == MAP_FAILED) { 161 | VHD_LOG_ERROR("unable to remap memory region %p-%p: %s", tail, 162 | tail + tail_len, strerror(errno)); 163 | munmap(map, map_len); 164 | return MAP_FAILED; 165 | } 166 | } 167 | 168 | char *start = aligned_addr - platform_page_size; 169 | char *end = aligned_addr + aligned_len + platform_page_size; 170 | munmap(map, start - map); 171 | munmap(end, map + map_len - end); 172 | 173 | return aligned_addr; 174 | } 175 | 176 | static int unmap_memory(void *addr, size_t len) 177 | { 178 | size_t map_len = VHD_ALIGN_PTR_UP(len, HUGE_PAGE_SIZE) + platform_page_size * 2; 179 | char *map = addr - platform_page_size; 180 | return munmap(map, map_len); 181 | } 182 | 183 | static int map_region(struct vhd_memory_region *region, uint64_t gpa, 184 | uint64_t uva, size_t size, int fd, off_t offset, 185 | bool preserve_fd) 186 | { 187 | void *ptr; 188 | int ret; 189 | 190 | ptr = map_memory(size, fd, offset); 191 | if (ptr == MAP_FAILED) { 192 | int ret = -errno; 193 | VHD_LOG_ERROR("can't mmap memory: %s", strerror(-ret)); 194 | return ret; 195 | } 196 | 197 | region->ptr = ptr; 198 | region->gpa = gpa; 199 | region->uva = uva; 200 | region->size = size; 201 | region->offset = offset; 202 | 203 | ret = region_init_id(region, fd, preserve_fd); 204 | if (ret < 0) { 205 | munmap(ptr, size); 206 | return ret; 207 | } 208 | 209 | if (region->callbacks.map_cb) { 210 | size_t len = VHD_ALIGN_PTR_UP(size, HUGE_PAGE_SIZE); 211 | ret = region->callbacks.map_cb(ptr, len); 212 | if (ret < 0) { 213 | VHD_LOG_ERROR("map callback failed for region %p-%p: %s", 214 | ptr, ptr + len, strerror(-ret)); 215 | munmap(ptr, size); 216 | return ret; 217 | } 218 | } 219 | 220 | /* Mark memory as defined explicitly */ 221 | VHD_MEMCHECK_DEFINED(ptr, size); 222 | 223 | return 0; 224 | } 225 | 226 | static int unmap_region(struct vhd_memory_region *reg) 227 | { 228 | int ret; 229 | 230 | if (reg->callbacks.unmap_cb) { 231 | size_t len = VHD_ALIGN_PTR_UP(reg->size, HUGE_PAGE_SIZE); 232 | ret = reg->callbacks.unmap_cb(reg->ptr, len); 233 | if (ret < 0) { 234 | VHD_LOG_ERROR("unmap callback failed for region %p-%p: %s", 235 | reg->ptr, reg->ptr + reg->size, strerror(-ret)); 236 | return ret; 237 | } 238 | } 239 | 240 | ret = unmap_memory(reg->ptr, reg->size); 241 | if (ret < 0) { 242 | VHD_LOG_ERROR("failed to unmap region at %p", reg->ptr); 243 | return ret; 244 | } 245 | 246 | return 0; 247 | } 248 | 249 | static void 
region_do_release(struct vhd_memory_region *reg) 250 | { 251 | VHD_ASSERT(vhd_in_ctl_thread()); 252 | 253 | LIST_REMOVE(reg, region_link); 254 | unmap_region(reg); 255 | if (reg->fd >= 0) { 256 | close(reg->fd); 257 | } 258 | vhd_free(reg); 259 | } 260 | 261 | static void reap_regions_bh(void *unused) 262 | { 263 | struct vhd_memory_region *reg, *tmp_reg; 264 | 265 | VHD_ASSERT(vhd_in_ctl_thread()); 266 | 267 | LIST_FOREACH_SAFE(reg, &g_regions, region_link, tmp_reg) { 268 | if (objref_read(®->ref) != 0) { 269 | continue; 270 | } 271 | 272 | region_do_release(reg); 273 | } 274 | } 275 | 276 | static void region_release(struct objref *objref) 277 | { 278 | struct vhd_memory_region *reg = 279 | containerof(objref, struct vhd_memory_region, ref); 280 | 281 | if (vhd_in_ctl_thread()) { 282 | /* 283 | * Only the control thread gets the right to actually delete regions. 284 | * All other threads do it by submitting control work. 285 | */ 286 | region_do_release(reg); 287 | return; 288 | } 289 | 290 | vhd_run_in_ctl(reap_regions_bh, NULL); 291 | } 292 | 293 | static void region_ref(struct vhd_memory_region *reg) 294 | { 295 | objref_get(®->ref); 296 | } 297 | 298 | static void region_unref(struct vhd_memory_region *reg) 299 | { 300 | objref_put(®->ref); 301 | } 302 | 303 | static inline struct vhd_memory_region *region_get_cached( 304 | uint64_t gpa, uint64_t uva, 305 | size_t size, int fd, 306 | off_t offset, 307 | struct vhd_mmap_callbacks *callbacks, 308 | bool preserve_fd 309 | ) 310 | { 311 | struct vhd_memory_region *region; 312 | struct stat stat; 313 | 314 | if (fstat(fd, &stat) < 0) { 315 | return NULL; 316 | } 317 | 318 | LIST_FOREACH(region, &g_regions, region_link) { 319 | if (region->inode != stat.st_ino || region->device != stat.st_dev) { 320 | continue; 321 | } 322 | if (region->gpa != gpa || region->uva != uva || 323 | region->size != size || region->offset != offset) { 324 | continue; 325 | } 326 | if (region->callbacks.map_cb != callbacks->map_cb || 327 | region->callbacks.unmap_cb != callbacks->unmap_cb) { 328 | continue; 329 | } 330 | if (preserve_fd && region->fd == -1) { 331 | continue; 332 | } 333 | 334 | region_ref(region); 335 | return region; 336 | } 337 | 338 | return NULL; 339 | } 340 | 341 | static void memmap_release(struct objref *objref) 342 | { 343 | struct vhd_memory_map *mm = 344 | containerof(objref, struct vhd_memory_map, ref); 345 | unsigned i; 346 | 347 | for (i = 0; i < mm->num; i++) { 348 | region_unref(mm->regions[i]); 349 | } 350 | 351 | vhd_free(mm); 352 | } 353 | 354 | void vhd_memmap_ref(struct vhd_memory_map *mm) __attribute__ ((weak)); 355 | void vhd_memmap_ref(struct vhd_memory_map *mm) 356 | { 357 | objref_get(&mm->ref); 358 | } 359 | 360 | void vhd_memmap_unref(struct vhd_memory_map *mm) __attribute__ ((weak)); 361 | void vhd_memmap_unref(struct vhd_memory_map *mm) 362 | { 363 | objref_put(&mm->ref); 364 | } 365 | 366 | uint64_t ptr_to_gpa(struct vhd_memory_map *mm, void *ptr) 367 | { 368 | unsigned i; 369 | for (i = 0; i < mm->num; ++i) { 370 | struct vhd_memory_region *reg = mm->regions[i]; 371 | if (ptr >= reg->ptr && ptr < reg->ptr + reg->size) { 372 | return (ptr - reg->ptr) + reg->gpa; 373 | } 374 | } 375 | 376 | VHD_LOG_WARN("Failed to translate ptr %p to gpa", ptr); 377 | return TRANSLATION_FAILED; 378 | } 379 | 380 | void *gpa_range_to_ptr(struct vhd_memory_map *mm, 381 | uint64_t gpa, size_t len) __attribute__ ((weak)); 382 | void *gpa_range_to_ptr(struct vhd_memory_map *mm, uint64_t gpa, size_t len) 383 | { 384 | unsigned i; 385 | 386 
| for (i = 0; i < mm->num; i++) { 387 | struct vhd_memory_region *reg = mm->regions[i]; 388 | if (gpa >= reg->gpa && gpa - reg->gpa < reg->size) { 389 | /* 390 | * Check (overflow-safe) that length fits in a single region. 391 | * 392 | * TODO: should we handle gpa areas that cross region boundaries 393 | * but are otherwise valid? 394 | */ 395 | if (len > reg->size || gpa - reg->gpa + len > reg->size) { 396 | return NULL; 397 | } 398 | 399 | return reg->ptr + (gpa - reg->gpa); 400 | } 401 | } 402 | 403 | return NULL; 404 | } 405 | 406 | struct vhd_memory_map *vhd_memmap_new(int (*map_cb)(void *, size_t), 407 | int (*unmap_cb)(void *, size_t)) 408 | { 409 | struct vhd_memory_map *mm = vhd_alloc(sizeof(*mm)); 410 | *mm = (struct vhd_memory_map) { 411 | .callbacks = (struct vhd_mmap_callbacks) { 412 | .map_cb = map_cb, 413 | .unmap_cb = unmap_cb, 414 | } 415 | }; 416 | 417 | objref_init(&mm->ref, memmap_release); 418 | return mm; 419 | } 420 | 421 | struct vhd_memory_map *vhd_memmap_dup(struct vhd_memory_map *mm) 422 | { 423 | size_t i; 424 | struct vhd_memory_map *new_mm = vhd_alloc(sizeof(*mm)); 425 | 426 | new_mm->callbacks = mm->callbacks; 427 | new_mm->num = mm->num; 428 | objref_init(&new_mm->ref, memmap_release); 429 | 430 | for (i = 0; i < mm->num; i++) { 431 | struct vhd_memory_region *reg = mm->regions[i]; 432 | region_ref(reg); 433 | new_mm->regions[i] = reg; 434 | } 435 | 436 | return new_mm; 437 | } 438 | 439 | static int region_create( 440 | uint64_t gpa, uint64_t uva, size_t size, int fd, 441 | off_t offset, struct vhd_mmap_callbacks callbacks, 442 | bool preserve_fd, struct vhd_memory_region **out_region) 443 | { 444 | struct vhd_memory_region *region; 445 | int ret; 446 | 447 | region = vhd_calloc(1, sizeof(*region)); 448 | *region = (struct vhd_memory_region) { 449 | .callbacks = callbacks, 450 | }; 451 | 452 | objref_init(®ion->ref, region_release); 453 | 454 | ret = map_region(region, gpa, uva, size, fd, offset, preserve_fd); 455 | if (ret < 0) { 456 | vhd_free(region); 457 | return ret; 458 | } 459 | 460 | LIST_INSERT_HEAD(&g_regions, region, region_link); 461 | *out_region = region; 462 | return 0; 463 | } 464 | 465 | struct vhd_memory_map *vhd_memmap_dup_remap(struct vhd_memory_map *mm) 466 | { 467 | int ret; 468 | size_t i; 469 | struct vhd_memory_map *new_mm; 470 | 471 | // Verify that the memmap was created with preserve_fd=true 472 | for (i = 0; i < mm->num; i++) { 473 | if (unlikely(mm->regions[i]->fd < 0)) { 474 | VHD_LOG_ERROR("attempting to remap a memory map without preserved" 475 | " fds"); 476 | return NULL; 477 | } 478 | } 479 | 480 | new_mm = vhd_alloc(sizeof(*mm)); 481 | new_mm->callbacks = mm->callbacks; 482 | new_mm->num = mm->num; 483 | objref_init(&new_mm->ref, memmap_release); 484 | 485 | for (i = 0; i < mm->num; i++) { 486 | struct vhd_memory_region *reg = mm->regions[i]; 487 | ret = region_create(reg->gpa, reg->uva, reg->size, 488 | reg->fd, reg->offset, mm->callbacks, 489 | true, &new_mm->regions[i]); 490 | 491 | if (unlikely(ret < 0)) { 492 | while (i-- > 0) { 493 | region_unref(new_mm->regions[i]); 494 | } 495 | vhd_free(new_mm); 496 | return NULL; 497 | } 498 | } 499 | 500 | return new_mm; 501 | } 502 | 503 | int vhd_memmap_add_slot(struct vhd_memory_map *mm, uint64_t gpa, uint64_t uva, 504 | size_t size, int fd, off_t offset, bool preserve_fd) 505 | { 506 | int ret; 507 | unsigned i; 508 | struct vhd_memory_region *region; 509 | 510 | /* check for overflow */ 511 | if (gpa + size < gpa || uva + size < uva) { 512 | return -EINVAL; 513 | } 514 | 
/* check for spare slots */ 515 | if (mm->num == VHD_RAM_SLOTS_MAX) { 516 | return -ENOBUFS; 517 | } 518 | /* check for intersection with existing slots */ 519 | for (i = 0; i < mm->num; i++) { 520 | struct vhd_memory_region *reg = mm->regions[i]; 521 | if (reg->gpa + reg->size <= gpa || gpa + size <= reg->gpa || 522 | reg->uva + reg->size <= uva || uva + size <= reg->uva) { 523 | continue; 524 | } 525 | return -EINVAL; 526 | } 527 | 528 | /* find appropriate position to keep ascending order in gpa */ 529 | for (i = mm->num; i > 0; i--) { 530 | struct vhd_memory_region *reg = mm->regions[i - 1]; 531 | if (reg->gpa < gpa) { 532 | break; 533 | } 534 | } 535 | 536 | region = region_get_cached(gpa, uva, size, fd, offset, &mm->callbacks, 537 | preserve_fd); 538 | if (region == NULL) { 539 | ret = region_create(gpa, uva, size, fd, offset, mm->callbacks, 540 | preserve_fd, ®ion); 541 | if (ret < 0) { 542 | return ret; 543 | } 544 | } else { 545 | VHD_LOG_INFO( 546 | "region %jd-%ju (GPA 0x%016"PRIX64" -> 0x%016"PRIX64") cache hit, " 547 | "reusing (%u refs total)", region->device, region->inode, 548 | region->gpa, region->gpa + region->size, objref_read(®ion->ref) 549 | ); 550 | } 551 | 552 | if (i < mm->num) { 553 | memmove(&mm->regions[i + 1], &mm->regions[i], 554 | sizeof(mm->regions[0]) * (mm->num - i)); 555 | } 556 | mm->regions[i] = region; 557 | mm->num++; 558 | 559 | return 0; 560 | } 561 | 562 | int vhd_memmap_del_slot(struct vhd_memory_map *mm, uint64_t gpa, uint64_t uva, 563 | size_t size) 564 | { 565 | unsigned i; 566 | 567 | for (i = 0; i < mm->num; i++) { 568 | struct vhd_memory_region *reg = mm->regions[i]; 569 | if (reg->gpa == gpa && reg->uva == uva && reg->size == size) { 570 | break; 571 | } 572 | } 573 | 574 | if (i == mm->num) { 575 | return -ENXIO; 576 | } 577 | 578 | region_unref(mm->regions[i]); 579 | 580 | mm->num--; 581 | if (i < mm->num) { 582 | memmove(&mm->regions[i], &mm->regions[i + 1], 583 | sizeof(mm->regions[0]) * (mm->num - i)); 584 | } 585 | 586 | return 0; 587 | } 588 | -------------------------------------------------------------------------------- /event.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Based on QEMU's util/async.c 3 | * 4 | * Copyright (c) 2003-2008 Fabrice Bellard 5 | * Copyright (c) 2009-2017 QEMU contributors 6 | * 7 | * Permission is hereby granted, free of charge, to any person obtaining a copy 8 | * of this software and associated documentation files (the "Software"), to deal 9 | * in the Software without restriction, including without limitation the rights 10 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the Software is 12 | * furnished to do so, subject to the following conditions: 13 | * 14 | * The above copyright notice and this permission notice shall be included in 15 | * all copies or substantial portions of the Software. 16 | * 17 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 20 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 23 | * THE SOFTWARE. 
24 | */ 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | 32 | #include "catomic.h" 33 | #include "queue.h" 34 | #include "platform.h" 35 | #include "event.h" 36 | #include "logging.h" 37 | 38 | enum { 39 | /* Already enqueued and waiting for bh_poll() */ 40 | BH_PENDING = (1 << 0), 41 | 42 | /* Invoke the callback */ 43 | BH_SCHEDULED = (1 << 1), 44 | 45 | /* Delete without invoking callback */ 46 | BH_DELETED = (1 << 2), 47 | 48 | /* Delete after invoking callback */ 49 | BH_ONESHOT = (1 << 3), 50 | }; 51 | 52 | struct vhd_bh { 53 | struct vhd_event_loop *ctx; 54 | vhd_bh_cb *cb; 55 | void *opaque; 56 | SLIST_ENTRY(vhd_bh) next; 57 | unsigned flags; 58 | }; 59 | 60 | typedef SLIST_HEAD(, vhd_bh) vhd_bh_list; 61 | 62 | struct vhd_event_loop { 63 | int epollfd; 64 | 65 | /* eventfd we use to cancel epoll_wait if needed */ 66 | int notifyfd; 67 | 68 | /* number of currently attached events (for consistency checks) */ 69 | uint32_t num_events_attached; 70 | 71 | bool notified; 72 | 73 | /* vhd_terminate_event_loop has been completed */ 74 | bool is_terminated; 75 | 76 | bool has_home_thread; 77 | 78 | /* preallocated events buffer */ 79 | struct epoll_event *events; 80 | uint64_t max_events; 81 | 82 | vhd_bh_list bh_list; 83 | 84 | SLIST_HEAD(, vhd_io_handler) deleted_handlers; 85 | }; 86 | 87 | static void evloop_notify(struct vhd_event_loop *evloop) 88 | { 89 | if (!catomic_xchg(&evloop->notified, true)) { 90 | vhd_set_eventfd(evloop->notifyfd); 91 | } 92 | } 93 | 94 | static void notify_accept(struct vhd_event_loop *evloop) 95 | { 96 | if (catomic_read(&evloop->notified)) { 97 | vhd_clear_eventfd(evloop->notifyfd); 98 | catomic_xchg(&evloop->notified, false); 99 | } 100 | } 101 | 102 | /* called concurrently from any thread */ 103 | static void bh_enqueue(struct vhd_bh *bh, unsigned new_flags) 104 | { 105 | struct vhd_event_loop *ctx = bh->ctx; 106 | unsigned old_flags; 107 | 108 | /* 109 | * The memory barrier implicit in catomic_fetch_or makes sure that: 110 | * 1. any writes needed by the callback are done before the locations are 111 | * read in the bh_poll. 112 | * 2. ctx is loaded before the callback has a chance to execute and bh 113 | * could be freed. 114 | * Paired with bh_dequeue(). 115 | */ 116 | old_flags = catomic_fetch_or(&bh->flags, BH_PENDING | new_flags); 117 | if (!(old_flags & BH_PENDING)) { 118 | SLIST_INSERT_HEAD_ATOMIC(&ctx->bh_list, bh, next); 119 | } 120 | 121 | evloop_notify(ctx); 122 | } 123 | 124 | /* only called from bh_poll() and bh_cleanup() */ 125 | static struct vhd_bh *bh_dequeue(vhd_bh_list *head, unsigned *flags) 126 | { 127 | struct vhd_bh *bh = SLIST_FIRST_RCU(head); 128 | 129 | if (!bh) { 130 | return NULL; 131 | } 132 | 133 | SLIST_REMOVE_HEAD(head, next); 134 | 135 | /* 136 | * The catomic_and is paired with bh_enqueue(). The implicit memory barrier 137 | * ensures that the callback sees all writes done by the scheduling thread. 138 | * It also ensures that the scheduling thread sees the cleared flag before 139 | * bh->cb has run, and thus will call evloop_notify again if necessary. 
140 | */ 141 | *flags = catomic_fetch_and(&bh->flags, ~(BH_PENDING | BH_SCHEDULED)); 142 | return bh; 143 | } 144 | 145 | struct vhd_bh *vhd_bh_new(struct vhd_event_loop *ctx, 146 | vhd_bh_cb *cb, void *opaque) 147 | { 148 | struct vhd_bh *bh = vhd_alloc(sizeof(*bh)); 149 | *bh = (struct vhd_bh){ 150 | .ctx = ctx, 151 | .cb = cb, 152 | .opaque = opaque, 153 | }; 154 | return bh; 155 | } 156 | 157 | void vhd_bh_schedule_oneshot(struct vhd_event_loop *ctx, 158 | vhd_bh_cb *cb, void *opaque) 159 | { 160 | struct vhd_bh *bh = vhd_bh_new(ctx, cb, opaque); 161 | bh_enqueue(bh, BH_SCHEDULED | BH_ONESHOT); 162 | } 163 | 164 | void vhd_bh_schedule(struct vhd_bh *bh) 165 | { 166 | bh_enqueue(bh, BH_SCHEDULED); 167 | } 168 | 169 | /* this is async and doesn't interfere with already running bh */ 170 | void vhd_bh_cancel(struct vhd_bh *bh) 171 | { 172 | catomic_and(&bh->flags, ~BH_SCHEDULED); 173 | } 174 | 175 | /* this is async; deletion only happens in bh_poll, so need to enqueue first */ 176 | void vhd_bh_delete(struct vhd_bh *bh) 177 | { 178 | bh_enqueue(bh, BH_DELETED); 179 | } 180 | 181 | 182 | static void bh_call(struct vhd_bh *bh) 183 | { 184 | bh->cb(bh->opaque); 185 | } 186 | 187 | /* 188 | * Execute bottom halves scheduled so far. Return true if any progress has 189 | * been made (i.e. any bh was executed). 190 | * Multiple occurrences of bh_poll cannot be called concurrently. 191 | */ 192 | static bool bh_poll(struct vhd_event_loop *ctx) 193 | { 194 | vhd_bh_list bh_list; 195 | struct vhd_bh *bh; 196 | unsigned flags; 197 | bool ret = false; 198 | 199 | SLIST_INIT(&bh_list); 200 | /* swap bh list from ctx for a fresh one */ 201 | SLIST_MOVE_ATOMIC(&bh_list, &ctx->bh_list); 202 | 203 | for (;;) { 204 | bh = bh_dequeue(&bh_list, &flags); 205 | if (!bh) { 206 | break; 207 | } 208 | 209 | if ((flags & (BH_SCHEDULED | BH_DELETED)) == BH_SCHEDULED) { 210 | ret = true; 211 | bh_call(bh); 212 | } 213 | 214 | if (flags & (BH_DELETED | BH_ONESHOT)) { 215 | vhd_free(bh); 216 | } 217 | } 218 | 219 | return ret; 220 | } 221 | 222 | static void bh_cleanup(struct vhd_event_loop *ctx) 223 | { 224 | struct vhd_bh *bh; 225 | unsigned flags; 226 | 227 | for (;;) { 228 | bh = bh_dequeue(&ctx->bh_list, &flags); 229 | if (!bh) { 230 | break; 231 | } 232 | 233 | /* only deleted bhs may remain */ 234 | assert(flags & BH_DELETED); 235 | vhd_free(bh); 236 | } 237 | } 238 | 239 | struct vhd_io_handler { 240 | struct vhd_event_loop *evloop; 241 | int (*read)(void *opaque); 242 | /* FIXME: must really include write handler as well */ 243 | void *opaque; 244 | int fd; 245 | 246 | bool attached; 247 | SLIST_ENTRY(vhd_io_handler) deleted_entry; 248 | }; 249 | 250 | static int handle_one_event(struct vhd_io_handler *handler, int event_code) 251 | { 252 | if ((event_code & (EPOLLIN | EPOLLERR | EPOLLRDHUP)) && handler->read) { 253 | return handler->read(handler->opaque); 254 | } 255 | 256 | return 0; 257 | } 258 | 259 | static int handle_events(struct vhd_event_loop *evloop, int nevents) 260 | { 261 | int nerr = 0; 262 | struct epoll_event *events = evloop->events; 263 | 264 | for (int i = 0; i < nevents; i++) { 265 | struct vhd_io_handler *handler = events[i].data.ptr; 266 | /* event loop notifer doesn't use a handler */ 267 | if (!handler) { 268 | continue; 269 | } 270 | /* don't call into detached handler even if it's on the ready list */ 271 | if (!handler->attached) { 272 | continue; 273 | } 274 | if (handle_one_event(handler, events[i].events)) { 275 | nerr++; 276 | } 277 | } 278 | 279 | /* 280 | * The deleted 
handlers are detached and won't appear on the ready list any 281 | * more, so it's now safe to actually delete them. 282 | */ 283 | while (!SLIST_EMPTY(&evloop->deleted_handlers)) { 284 | struct vhd_io_handler *handler = 285 | SLIST_FIRST(&evloop->deleted_handlers); 286 | SLIST_REMOVE_HEAD(&evloop->deleted_handlers, deleted_entry); 287 | vhd_free(handler); 288 | } 289 | 290 | return nerr; 291 | } 292 | 293 | struct vhd_event_loop *vhd_create_event_loop(size_t max_events) 294 | { 295 | int notifyfd; 296 | int epollfd; 297 | 298 | epollfd = epoll_create1(EPOLL_CLOEXEC); 299 | if (epollfd < 0) { 300 | VHD_LOG_ERROR("epoll_create1: %s", strerror(errno)); 301 | return NULL; 302 | } 303 | 304 | notifyfd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); 305 | if (notifyfd < 0) { 306 | VHD_LOG_ERROR("eventfd() failed: %s", strerror(errno)); 307 | goto close_epoll; 308 | } 309 | 310 | /* Register notify eventfd, make sure it is level-triggered */ 311 | struct epoll_event ev = { 312 | .events = EPOLLIN, 313 | }; 314 | if (epoll_ctl(epollfd, EPOLL_CTL_ADD, notifyfd, &ev) == -1) { 315 | VHD_LOG_ERROR("epoll_ctl(EPOLL_CTL_ADD, notifyfd): %s", 316 | strerror(errno)); 317 | goto error_out; 318 | } 319 | 320 | struct vhd_event_loop *evloop = vhd_alloc(sizeof(*evloop)); 321 | max_events++; /* +1 for notify eventfd */ 322 | *evloop = (struct vhd_event_loop) { 323 | .epollfd = epollfd, 324 | .notifyfd = notifyfd, 325 | .max_events = max_events, 326 | .events = vhd_calloc(sizeof(evloop->events[0]), max_events), 327 | }; 328 | SLIST_INIT(&evloop->bh_list); 329 | SLIST_INIT(&evloop->deleted_handlers); 330 | 331 | return evloop; 332 | 333 | error_out: 334 | close(notifyfd); 335 | close_epoll: 336 | close(epollfd); 337 | return NULL; 338 | } 339 | 340 | static __thread struct vhd_event_loop *home_evloop; 341 | 342 | int vhd_run_event_loop(struct vhd_event_loop *evloop, int timeout_ms) 343 | { 344 | if (!home_evloop) { 345 | bool had_home_thread = catomic_xchg(&evloop->has_home_thread, true); 346 | VHD_VERIFY(!had_home_thread); 347 | home_evloop = evloop; 348 | } 349 | VHD_ASSERT(evloop == home_evloop); 350 | 351 | if (evloop->is_terminated) { 352 | return 0; 353 | } 354 | 355 | int nev = epoll_wait(evloop->epollfd, evloop->events, evloop->max_events, 356 | timeout_ms); 357 | if (!nev) { 358 | return -EAGAIN; 359 | } else if (nev < 0) { 360 | int ret = -errno; 361 | if (ret == -EINTR) { 362 | return -EAGAIN; 363 | } 364 | 365 | VHD_LOG_ERROR("epoll_wait internal error: %s", strerror(-ret)); 366 | return ret; 367 | } 368 | 369 | notify_accept(evloop); 370 | bh_poll(evloop); 371 | 372 | int nerr = handle_events(evloop, nev); 373 | if (nerr) { 374 | VHD_LOG_WARN("Got %d events, can't handle %d events", nev, nerr); 375 | return -EIO; 376 | } 377 | 378 | return -EAGAIN; 379 | } 380 | 381 | static void evloop_stop_bh(void *opaque) 382 | { 383 | struct vhd_event_loop *evloop = opaque; 384 | evloop->is_terminated = true; 385 | } 386 | 387 | void vhd_terminate_event_loop(struct vhd_event_loop *evloop) 388 | { 389 | vhd_bh_schedule_oneshot(evloop, evloop_stop_bh, evloop); 390 | } 391 | 392 | /* 393 | * Only free the event loop when there's no concurrent access to it. One way 394 | * to do it is to do free at the end of the thread running the event loop. 395 | * Another is to wait for the thread running the event loop to terminate (to 396 | * join it) and only do free afterwards. 
397 | */ 398 | void vhd_free_event_loop(struct vhd_event_loop *evloop) 399 | { 400 | VHD_ASSERT(evloop->is_terminated); 401 | VHD_ASSERT(evloop->num_events_attached == 0); 402 | bh_cleanup(evloop); 403 | close(evloop->epollfd); 404 | close(evloop->notifyfd); 405 | vhd_free(evloop->events); 406 | vhd_free(evloop); 407 | } 408 | 409 | static void event_loop_inc_events(struct vhd_event_loop *evloop) 410 | { 411 | evloop->num_events_attached++; 412 | } 413 | 414 | static void event_loop_dec_events(struct vhd_event_loop *evloop) 415 | { 416 | VHD_ASSERT(evloop->num_events_attached > 0); 417 | evloop->num_events_attached--; 418 | } 419 | 420 | int vhd_attach_io_handler(struct vhd_io_handler *handler) 421 | { 422 | struct vhd_event_loop *evloop = handler->evloop; 423 | int fd = handler->fd; 424 | 425 | struct epoll_event ev = { 426 | .events = EPOLLIN | EPOLLHUP | EPOLLRDHUP, 427 | .data.ptr = handler 428 | }; 429 | 430 | /* to maintain fields consistency only do this in the home event loop */ 431 | VHD_ASSERT(evloop == home_evloop); 432 | 433 | /* unlike detach, multiple attachment is a logic error */ 434 | VHD_ASSERT(!handler->attached); 435 | 436 | if (epoll_ctl(evloop->epollfd, EPOLL_CTL_ADD, fd, &ev) < 0) { 437 | int ret = -errno; 438 | VHD_LOG_ERROR("Can't add event: %s", strerror(-ret)); 439 | return ret; 440 | } 441 | 442 | handler->attached = true; 443 | 444 | return 0; 445 | } 446 | 447 | struct vhd_io_handler *vhd_add_io_handler(struct vhd_event_loop *evloop, 448 | int fd, int (*read)(void *opaque), 449 | void *opaque) 450 | { 451 | struct vhd_io_handler *handler; 452 | 453 | handler = vhd_alloc(sizeof(*handler)); 454 | *handler = (struct vhd_io_handler) { 455 | .evloop = evloop, 456 | .fd = fd, 457 | .read = read, 458 | .opaque = opaque 459 | }; 460 | 461 | if (vhd_attach_io_handler(handler) < 0) { 462 | goto fail; 463 | } 464 | 465 | event_loop_inc_events(evloop); 466 | return handler; 467 | fail: 468 | vhd_free(handler); 469 | return NULL; 470 | } 471 | 472 | int vhd_detach_io_handler(struct vhd_io_handler *handler) 473 | { 474 | struct vhd_event_loop *evloop = handler->evloop; 475 | 476 | /* to maintain fields consistency only do this in the home event loop */ 477 | VHD_ASSERT(evloop == home_evloop); 478 | 479 | if (!handler->attached) { 480 | return 0; 481 | } 482 | 483 | if (epoll_ctl(evloop->epollfd, EPOLL_CTL_DEL, handler->fd, NULL) < 0) { 484 | int ret = -errno; 485 | VHD_LOG_ERROR("Can't delete event: %s", strerror(-ret)); 486 | return ret; 487 | } 488 | 489 | /* 490 | * The file descriptor being detached may still be sitting on the ready 491 | * list returned by epoll_wait. 492 | * Make sure the handler for it isn't called. 493 | */ 494 | handler->attached = false; 495 | 496 | return 0; 497 | } 498 | 499 | int vhd_del_io_handler(struct vhd_io_handler *handler) 500 | { 501 | int ret; 502 | struct vhd_event_loop *evloop = handler->evloop; 503 | 504 | ret = vhd_detach_io_handler(handler); 505 | if (ret < 0) { 506 | return ret; 507 | } 508 | 509 | /* 510 | * The file descriptor being deleted may still be sitting on the ready list 511 | * returned by epoll_wait. 512 | * Schedule it for deallocation at the end of the iteration after the ready 513 | * event list processing is through. 
514 | */ 515 | SLIST_INSERT_HEAD(&evloop->deleted_handlers, handler, deleted_entry); 516 | 517 | event_loop_dec_events(evloop); 518 | return 0; 519 | } 520 | 521 | void vhd_clear_eventfd(int fd) 522 | { 523 | eventfd_t unused; 524 | while (eventfd_read(fd, &unused) && errno == EINTR) { 525 | ; 526 | } 527 | } 528 | 529 | void vhd_set_eventfd(int fd) 530 | { 531 | while (eventfd_write(fd, 1) && errno == EINTR) { 532 | ; 533 | } 534 | } 535 | 536 | struct vhd_work { 537 | void (*func)(struct vhd_work *, void *); 538 | void *opaque; 539 | int ret; 540 | sem_t wait; 541 | }; 542 | 543 | void vhd_complete_work(struct vhd_work *work, int ret) 544 | { 545 | work->ret = ret; 546 | /* 547 | * sem_post is a full memory barrier so the vhd_submit_work_and_wait will 548 | * see ->ret set above 549 | */ 550 | if (sem_post(&work->wait) < 0) { 551 | /* log an error and continue as there's no better strategy */ 552 | VHD_LOG_ERROR("sem_post: %s", strerror(errno)); 553 | } 554 | } 555 | 556 | static void work_bh(void *opaque) 557 | { 558 | struct vhd_work *work = opaque; 559 | work->func(work, work->opaque); 560 | } 561 | 562 | int vhd_submit_work_and_wait(struct vhd_event_loop *evloop, 563 | void (*func)(struct vhd_work *, void *), 564 | void *opaque) 565 | { 566 | int ret; 567 | struct vhd_work work = { 568 | .func = func, 569 | .opaque = opaque, 570 | }; 571 | 572 | /* waiting for completion in the same event loop would deadlock */ 573 | VHD_ASSERT(evloop != home_evloop); 574 | 575 | /* sem_init can't fail when both arguments are zero */ 576 | ret = sem_init(&work.wait, 0, 0); 577 | VHD_ASSERT(ret == 0); 578 | 579 | vhd_bh_schedule_oneshot(evloop, work_bh, &work); 580 | 581 | /* 582 | * sem_wait may fail with either EINTR (we handle it) or EINVAL when 583 | * called on invalid pointer, which is impossible here. 584 | */ 585 | do { 586 | ret = sem_wait(&work.wait); 587 | } while (ret < 0 && errno == EINTR); 588 | VHD_ASSERT(ret == 0); 589 | 590 | /* 591 | * sem_destroy may fail only with EINVAL when called on invalid pointer, 592 | * which is impossible here. 
593 | */ 594 | ret = sem_destroy(&work.wait); 595 | VHD_ASSERT(ret == 0); 596 | 597 | /* 598 | * sem_wait is a full memory barrier so this is the ->ret set in 599 | * vhd_complete_work 600 | */ 601 | return work.ret; 602 | } 603 | -------------------------------------------------------------------------------- /virtio/virtio_blk.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "vhost/blockdev.h" 5 | 6 | #include "virtio_blk.h" 7 | #include "virtio_blk_spec.h" 8 | 9 | #include "bio.h" 10 | #include "virt_queue.h" 11 | #include "logging.h" 12 | #include "server_internal.h" 13 | #include "vdev.h" 14 | 15 | /* virtio blk data for bdev io */ 16 | struct virtio_blk_io { 17 | struct virtio_virtq *vq; 18 | struct virtio_iov *iov; 19 | 20 | struct vhd_io io; 21 | struct vhd_bdev_io bdev_io; 22 | }; 23 | 24 | static size_t iov_size(const struct vhd_buffer *iov, unsigned niov) 25 | { 26 | size_t len; 27 | unsigned int i; 28 | 29 | len = 0; 30 | for (i = 0; i < niov; i++) { 31 | len += iov[i].len; 32 | } 33 | return len; 34 | } 35 | 36 | static uint8_t translate_status(enum vhd_bdev_io_result status) 37 | { 38 | switch (status) { 39 | case VHD_BDEV_SUCCESS: 40 | return VIRTIO_BLK_S_OK; 41 | default: 42 | return VIRTIO_BLK_S_IOERR; 43 | } 44 | } 45 | 46 | static void set_status(struct virtio_iov *iov, uint8_t status) 47 | { 48 | struct vhd_buffer *last_iov = &iov->iov_in[iov->niov_in - 1]; 49 | *((uint8_t *)last_iov->base) = status; 50 | } 51 | 52 | static void complete_req(struct vhd_vdev *vdev, struct virtio_virtq *vq, 53 | struct virtio_iov *iov, uint8_t status) 54 | { 55 | size_t in_size; 56 | 57 | set_status(iov, status); 58 | /* 59 | * the last byte in the IN buffer is always written (for status), so pass 60 | * the total length of the IN buffer to virtq_push() 61 | */ 62 | in_size = iov_size(iov->iov_in, iov->niov_in); 63 | virtq_push(vq, iov, in_size); 64 | 65 | if (status == VHD_BDEV_SUCCESS && vdev != NULL && 66 | vdev->pte_flush_byte_threshold) { 67 | size_t out_size; 68 | 69 | out_size = iov_size(iov->iov_out, iov->niov_out); 70 | catomic_sub(&vdev->bytes_left_before_pte_flush, in_size + out_size); 71 | } 72 | 73 | virtio_free_iov(iov); 74 | } 75 | 76 | static void complete_io(struct vhd_io *io) 77 | { 78 | struct virtio_blk_io *bio = containerof(io, struct virtio_blk_io, io); 79 | 80 | if (likely(bio->io.status != VHD_BDEV_CANCELED)) { 81 | complete_req(io->vring->vdev, bio->vq, bio->iov, 82 | translate_status(bio->io.status)); 83 | } else { 84 | virtio_free_iov(bio->iov); 85 | } 86 | 87 | vhd_free(bio); 88 | } 89 | 90 | static bool is_valid_block_range_req(uint64_t sector, size_t nsectors, 91 | uint64_t capacity) 92 | { 93 | if (nsectors > capacity || sector > capacity - nsectors) { 94 | VHD_LOG_ERROR("Request (%" PRIu64 "s, +%zus) spans" 95 | " beyond device capacity %" PRIu64, 96 | sector, nsectors, capacity); 97 | return false; 98 | } 99 | 100 | return true; 101 | } 102 | 103 | static bool is_valid_req(uint64_t sector, size_t len, uint64_t capacity) 104 | { 105 | size_t nsectors = len / VIRTIO_BLK_SECTOR_SIZE; 106 | 107 | if (len == 0) { 108 | VHD_LOG_ERROR("Zero size request"); 109 | return false; 110 | } 111 | if (len % VIRTIO_BLK_SECTOR_SIZE) { 112 | VHD_LOG_ERROR("Request length %zu" 113 | " is not a multiple of sector size %u", 114 | len, VIRTIO_BLK_SECTOR_SIZE); 115 | return false; 116 | } 117 | 118 | return is_valid_block_range_req(sector, nsectors, capacity); 119 | } 120 | 121 | static bool 
bio_submit(struct virtio_blk_io *bio) 122 | { 123 | int res = virtio_blk_handle_request(bio->vq, &bio->io); 124 | if (res != 0) { 125 | VHD_LOG_ERROR("bdev request submission failed with %d", res); 126 | vhd_free(bio); 127 | return false; 128 | } 129 | 130 | return true; 131 | } 132 | 133 | static void handle_inout(struct virtio_blk_dev *dev, 134 | struct virtio_blk_req_hdr *req, 135 | struct virtio_virtq *vq, 136 | struct virtio_iov *iov) 137 | { 138 | size_t len; 139 | uint16_t ndatabufs; 140 | struct vhd_buffer *pdata; 141 | enum vhd_bdev_io_type io_type; 142 | 143 | if (req->type == VIRTIO_BLK_T_IN) { 144 | io_type = VHD_BDEV_READ; 145 | pdata = &iov->iov_in[0]; 146 | ndatabufs = iov->niov_in - 1; 147 | } else { 148 | if (virtio_blk_is_readonly(dev)) { 149 | VHD_LOG_ERROR("Write request to readonly device"); 150 | goto fail_request; 151 | } 152 | io_type = VHD_BDEV_WRITE; 153 | pdata = &iov->iov_out[1]; 154 | ndatabufs = iov->niov_out - 1; 155 | } 156 | 157 | len = iov_size(pdata, ndatabufs); 158 | 159 | if (!is_valid_req(req->sector, len, dev->config.capacity)) { 160 | goto fail_request; 161 | } 162 | 163 | struct virtio_blk_io *bio = vhd_zalloc(sizeof(*bio)); 164 | bio->vq = vq; 165 | bio->iov = iov; 166 | bio->io.completion_handler = complete_io; 167 | 168 | bio->bdev_io.type = io_type; 169 | /* 170 | * bdev_io fields must be in VHD_SECTOR_SIZE, but it's equal to 171 | * VIRTIO_SECTOR_SIZE, which is 512 bytes. We assert that below 172 | * in virtio_blk_init_dev() 173 | */ 174 | bio->bdev_io.first_sector = req->sector; 175 | bio->bdev_io.total_sectors = len / VIRTIO_BLK_SECTOR_SIZE; 176 | bio->bdev_io.sglist.nbuffers = ndatabufs; 177 | bio->bdev_io.sglist.buffers = pdata; 178 | 179 | if (!bio_submit(bio)) { 180 | goto fail_request; 181 | } 182 | 183 | /* request will be completed asynchronously */ 184 | return; 185 | 186 | fail_request: 187 | complete_req(NULL, vq, iov, VIRTIO_BLK_S_IOERR); 188 | } 189 | 190 | static void handle_discard_or_write_zeroes(struct virtio_blk_dev *dev, 191 | le32 type, 192 | struct virtio_virtq *vq, 193 | struct virtio_iov *iov) 194 | { 195 | struct virtio_blk_discard_write_zeroes seg; 196 | struct virtio_blk_io *bio; 197 | enum vhd_bdev_io_type io_type; 198 | le32 max_sectors; 199 | bool is_discard = type == VIRTIO_BLK_T_DISCARD; 200 | const char *type_str = is_discard ? "discard" : "write-zeroes"; 201 | VHD_ASSERT(is_discard || type == VIRTIO_BLK_T_WRITE_ZEROES); 202 | 203 | if (virtio_blk_is_readonly(dev)) { 204 | VHD_LOG_ERROR("%s request to readonly device", type_str); 205 | goto fail_request; 206 | } 207 | 208 | /* 209 | * The data used for discard, secure erase or write zeroes commands 210 | * consists of one or more segments. We support only one at the moment. 
211 | */ 212 | if (iov->niov_out != 2) { 213 | VHD_LOG_ERROR("Invalid number of segments for a " 214 | "%s request %"PRIu16, 215 | type_str, iov->niov_out); 216 | goto fail_request; 217 | } 218 | 219 | if (iov->iov_out[1].len != sizeof(seg)) { 220 | VHD_LOG_ERROR("Invalid %s segment size: " 221 | "expected %zu, got %zu!", type_str, 222 | sizeof(seg), iov->iov_out[1].len); 223 | goto fail_request; 224 | } 225 | 226 | memcpy(&seg, iov->iov_out[1].base, sizeof(seg)); 227 | if (!is_valid_block_range_req(seg.sector, seg.num_sectors, 228 | dev->config.capacity)) { 229 | goto fail_request; 230 | } 231 | 232 | if (is_discard) { 233 | le32 alignment = dev->config.discard_sector_alignment; 234 | 235 | if (!VHD_IS_ALIGNED(seg.num_sectors, alignment)) { 236 | VHD_LOG_ERROR("Discard request sector count %"PRIu32 237 | " not aligned to %"PRIu32, 238 | seg.num_sectors, alignment); 239 | goto fail_request; 240 | } 241 | 242 | if (!VHD_IS_ALIGNED(seg.sector, alignment)) { 243 | VHD_LOG_ERROR("Discard request sector %"PRIu64 244 | " not aligned to %"PRIu32, 245 | seg.sector, alignment); 246 | goto fail_request; 247 | } 248 | 249 | io_type = VHD_BDEV_DISCARD; 250 | max_sectors = dev->config.max_discard_sectors; 251 | } else { 252 | io_type = VHD_BDEV_WRITE_ZEROES; 253 | max_sectors = dev->config.max_write_zeroes_sectors; 254 | } 255 | 256 | if (seg.num_sectors > max_sectors) { 257 | VHD_LOG_ERROR("%s request too large: " 258 | "%"PRIu32" (max is %"PRIu32")", 259 | type_str, seg.num_sectors, max_sectors); 260 | goto fail_request; 261 | } 262 | 263 | bio = vhd_zalloc(sizeof(*bio)); 264 | bio->vq = vq; 265 | bio->iov = iov; 266 | bio->io.completion_handler = complete_io; 267 | bio->bdev_io.type = io_type; 268 | /* 269 | * bdev_io fields must be in VHD_SECTOR_SIZE, but it's equal to 270 | * VIRTIO_SECTOR_SIZE, which is 512 bytes. 
We assert that below 271 | * in virtio_blk_init_dev() 272 | */ 273 | bio->bdev_io.first_sector = seg.sector; 274 | bio->bdev_io.total_sectors = seg.num_sectors; 275 | 276 | if (!bio_submit(bio)) { 277 | goto fail_request; 278 | } 279 | 280 | /* request will be completed asynchronously */ 281 | return; 282 | 283 | fail_request: 284 | complete_req(NULL, vq, iov, VIRTIO_BLK_S_IOERR); 285 | } 286 | 287 | static uint8_t handle_getid(struct virtio_blk_dev *dev, 288 | struct virtio_iov *iov) 289 | { 290 | if (iov->niov_in != 2) { 291 | VHD_LOG_ERROR("Bad number of IN segments %u in request", iov->niov_in); 292 | return VIRTIO_BLK_S_IOERR; 293 | } 294 | 295 | struct vhd_buffer *id_buf = &iov->iov_in[0]; 296 | 297 | if (id_buf->len != VIRTIO_BLK_DISKID_LENGTH) { 298 | VHD_LOG_ERROR("Bad id buffer (len %zu)", id_buf->len); 299 | return VIRTIO_BLK_S_IOERR; 300 | } 301 | 302 | /* 303 | * strncpy will not add a null-term if src length is >= desc->len, which is 304 | * what we need 305 | */ 306 | strncpy((char *) id_buf->base, dev->serial, id_buf->len); 307 | 308 | return VIRTIO_BLK_S_OK; 309 | } 310 | 311 | static bool dev_supports_req(struct virtio_blk_dev *dev, le32 type) 312 | { 313 | int feature; 314 | 315 | switch (type) { 316 | case VIRTIO_BLK_T_IN: 317 | case VIRTIO_BLK_T_OUT: 318 | case VIRTIO_BLK_T_GET_ID: 319 | return true; 320 | case VIRTIO_BLK_T_DISCARD: 321 | feature = VIRTIO_BLK_F_DISCARD; 322 | break; 323 | case VIRTIO_BLK_T_WRITE_ZEROES: 324 | feature = VIRTIO_BLK_F_WRITE_ZEROES; 325 | break; 326 | default: 327 | return false; 328 | } 329 | 330 | return virtio_blk_has_feature(dev, feature); 331 | } 332 | 333 | static void handle_buffers(void *arg, struct virtio_virtq *vq, 334 | struct virtio_iov *iov) 335 | { 336 | uint8_t status; 337 | struct virtio_blk_dev *dev = arg; 338 | struct virtio_blk_req_hdr *req; 339 | le32 type; 340 | 341 | /* 342 | * Assume legacy message framing without VIRTIO_F_ANY_LAYOUT: 343 | * - one 16-byte device-readable segment for header 344 | * - data segments 345 | * - one 1-byte device-writable segment for status 346 | * FIXME: get rid of this assumption and support VIRTIO_F_ANY_LAYOUT 347 | */ 348 | 349 | if (!iov->niov_in || iov->iov_in[iov->niov_in - 1].len != 1) { 350 | VHD_LOG_ERROR("No room for status response in the request"); 351 | abort_request(vq, iov); 352 | return; 353 | } 354 | 355 | if (!iov->niov_out || iov->iov_out[0].len != sizeof(*req)) { 356 | VHD_LOG_ERROR("Malformed request header"); 357 | abort_request(vq, iov); 358 | return; 359 | } 360 | 361 | req = iov->iov_out[0].base; 362 | type = req->type; 363 | 364 | if (!dev_supports_req(dev, type)) { 365 | VHD_LOG_WARN("Unknown or unsupported request type %"PRIu32, type); 366 | status = VIRTIO_BLK_S_UNSUPP; 367 | goto out; 368 | } 369 | 370 | switch (type) { 371 | case VIRTIO_BLK_T_IN: 372 | case VIRTIO_BLK_T_OUT: 373 | handle_inout(dev, req, vq, iov); 374 | return; /* async completion */ 375 | case VIRTIO_BLK_T_GET_ID: 376 | status = handle_getid(dev, iov); 377 | break; 378 | case VIRTIO_BLK_T_DISCARD: 379 | case VIRTIO_BLK_T_WRITE_ZEROES: 380 | handle_discard_or_write_zeroes(dev, type, vq, iov); 381 | return; /* async completion */ 382 | default: /* unreachable because of dev_supports_req() */ 383 | VHD_UNREACHABLE(); 384 | }; 385 | 386 | out: 387 | complete_req(NULL, vq, iov, status); 388 | } 389 | 390 | /*////////////////////////////////////////////////////////////////////////////*/ 391 | 392 | int virtio_blk_dispatch_requests(struct virtio_blk_dev *dev, 393 | struct virtio_virtq *vq) 394 | { 
395 | return virtq_dequeue_many(vq, handle_buffers, dev); 396 | } 397 | 398 | __attribute__((weak)) 399 | int virtio_blk_handle_request(struct virtio_virtq *vq, struct vhd_io *io) 400 | { 401 | io->vring = VHD_VRING_FROM_VQ(vq); 402 | return vhd_enqueue_request(vhd_get_rq_for_vring(io->vring), io); 403 | } 404 | 405 | size_t virtio_blk_get_config(struct virtio_blk_dev *dev, void *cfgbuf, 406 | size_t bufsize, size_t offset) 407 | { 408 | if (offset >= sizeof(dev->config)) { 409 | return 0; 410 | } 411 | 412 | size_t data_size = MIN(bufsize, sizeof(dev->config) - offset); 413 | 414 | memcpy(cfgbuf, (char *)(&dev->config) + offset, data_size); 415 | 416 | return data_size; 417 | } 418 | 419 | uint64_t virtio_blk_get_features(struct virtio_blk_dev *dev) 420 | { 421 | return dev->features; 422 | } 423 | 424 | bool virtio_blk_has_feature(struct virtio_blk_dev *dev, int feature) 425 | { 426 | const uint64_t mask = 1ull << feature; 427 | return (virtio_blk_get_features(dev) & mask) == mask; 428 | } 429 | 430 | bool virtio_blk_is_readonly(struct virtio_blk_dev *dev) 431 | { 432 | return virtio_blk_has_feature(dev, VIRTIO_BLK_F_RO); 433 | } 434 | 435 | static void refresh_config_geometry(struct virtio_blk_config *config) 436 | { 437 | /* 438 | * Here we use same max values like we did for blockstor-plugin. 439 | * But it seems that the real world max values are: 440 | */ 441 | /* 63 for sectors */ 442 | const uint8_t max_sectors = 255; 443 | /* 16 for heads */ 444 | const uint8_t max_heads = 255; 445 | /* 16383 for cylinders */ 446 | const uint16_t max_cylinders = 65535; 447 | 448 | config->geometry.sectors = MIN(config->capacity, max_sectors); 449 | config->geometry.heads = 450 | MIN(1 + (config->capacity - 1) / max_sectors, max_heads); 451 | config->geometry.cylinders = 452 | MIN(1 + (config->capacity - 1) / (max_sectors * max_heads), 453 | max_cylinders); 454 | } 455 | 456 | uint64_t virtio_blk_get_total_blocks(struct virtio_blk_dev *dev) 457 | { 458 | return dev->config.capacity >> dev->config.topology.physical_block_exp; 459 | } 460 | 461 | void virtio_blk_set_total_blocks(struct virtio_blk_dev *dev, 462 | uint64_t total_blocks) 463 | { 464 | uint64_t new_capacity = 465 | total_blocks << dev->config.topology.physical_block_exp; 466 | 467 | if (new_capacity > dev->config.capacity) { 468 | VHD_LOG_INFO("virtio-blk resize: %" PRIu64 " -> %" PRIu64, 469 | dev->config.capacity, new_capacity); 470 | } else { 471 | VHD_LOG_WARN("virtio-blk resize not increasing: %" 472 | PRIu64 " -> %" PRIu64, 473 | dev->config.capacity, new_capacity); 474 | } 475 | 476 | dev->config.capacity = new_capacity; 477 | refresh_config_geometry(&dev->config); 478 | } 479 | 480 | void virtio_blk_init_dev( 481 | struct virtio_blk_dev *dev, 482 | const struct vhd_bdev_info *bdev) 483 | { 484 | uint32_t sector_size; 485 | 486 | dev->serial = vhd_strdup(bdev->serial); 487 | 488 | dev->features = VIRTIO_BLK_DEFAULT_FEATURES; 489 | if (vhd_blockdev_is_readonly(bdev)) { 490 | dev->features |= (1ull << VIRTIO_BLK_F_RO); 491 | } 492 | if (vhd_blockdev_has_discard(bdev)) { 493 | dev->features |= (1ull << VIRTIO_BLK_F_DISCARD); 494 | } 495 | if (vhd_blockdev_has_write_zeroes(bdev)) { 496 | dev->features |= (1ull << VIRTIO_BLK_F_WRITE_ZEROES); 497 | } 498 | 499 | /* 500 | * Both virtio and block backend use the same sector size of 512. Don't 501 | * bother converting between the two, just assert they are the same. 
502 | */ 503 | VHD_STATIC_ASSERT(VHD_SECTOR_SIZE == VIRTIO_BLK_SECTOR_SIZE); 504 | 505 | /* capacity in 512 byte virtio sectors */ 506 | dev->config.capacity = 507 | (bdev->total_blocks * bdev->block_size) / VIRTIO_BLK_SECTOR_SIZE; 508 | 509 | sector_size = vhd_blockdev_sector_size(bdev); 510 | 511 | /* blk_size in bytes, aka "logical-block" */ 512 | dev->config.blk_size = sector_size; 513 | dev->config.numqueues = bdev->num_queues; 514 | 515 | /* # of logical blocks per physical block (log2) */ 516 | dev->config.topology.physical_block_exp = 517 | vhd_find_first_bit32(bdev->block_size / sector_size); 518 | 519 | dev->config.topology.alignment_offset = 0; 520 | /* TODO: can get that from bdev info */ 521 | dev->config.topology.min_io_size = 1; 522 | 523 | /* opt_io_size in blk_size chunks (logical blocks) */ 524 | dev->config.topology.opt_io_size = 525 | bdev->optimal_io_size / sector_size; 526 | 527 | /* discard_sector_alignment in 512-bytes virtio sectors */ 528 | dev->config.discard_sector_alignment = 529 | sector_size / VIRTIO_BLK_SECTOR_SIZE; 530 | 531 | dev->config.max_discard_sectors = VIRTIO_BLK_MAX_DISCARD_SECTORS; 532 | dev->config.max_discard_seg = VIRTIO_BLK_MAX_DISCARD_SEGMENTS; 533 | 534 | dev->config.max_write_zeroes_sectors = VIRTIO_BLK_MAX_WRITE_ZEROES_SECTORS; 535 | dev->config.max_write_zeroes_seg = VIRTIO_BLK_MAX_WRITE_ZEROES_SEGMENTS; 536 | /* 537 | * Since we don't know anything about the user of the library beforehand 538 | * assume we _may_ unmap the sectors on write-zeroes. 539 | * TODO: maybe propagate this value from blockdev config at creation time? 540 | */ 541 | dev->config.write_zeroes_may_unmap = 1; 542 | 543 | /* 544 | * Hardcode seg_max to 126. The same way like it's done for virtio-blk in 545 | * qemu 2.12 which is used by blockstor-plugin. 546 | * Although, this is an error prone approch which leads to the problems 547 | * when queue size != 128 548 | * (see https://www.mail-archive.com/qemu-devel@nongnu.org/msg668144.html) 549 | * we have to use it to provide migration compatibility between virtio-blk 550 | * and vhost-user-blk in both directions. 
551 | */ 552 | dev->config.seg_max = 128 - 2; 553 | 554 | refresh_config_geometry(&dev->config); 555 | } 556 | 557 | void virtio_blk_destroy_dev(struct virtio_blk_dev *dev) 558 | { 559 | vhd_free(dev->serial); 560 | dev->serial = NULL; 561 | } 562 | 563 | struct vhd_bdev_io *vhd_get_bdev_io(struct vhd_io *io) 564 | { 565 | struct virtio_blk_io *bio = containerof(io, struct virtio_blk_io, io); 566 | return &bio->bdev_io; 567 | } 568 | -------------------------------------------------------------------------------- /virtio/virt_queue.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "catomic.h" 10 | #include "virt_queue.h" 11 | #include "logging.h" 12 | #include "memmap.h" 13 | #include "memlog.h" 14 | 15 | /** 16 | * Holds private virtq data together with iovs we show users 17 | */ 18 | struct virtq_iov_private { 19 | /* Private virtq fields */ 20 | uint16_t used_head; 21 | struct vhd_memory_map *mm; 22 | 23 | /* Iov we show to caller */ 24 | struct virtio_iov iov; 25 | }; 26 | 27 | static inline uint16_t virtq_get_used_event(struct virtio_virtq *vq) 28 | { 29 | return vq->avail->ring[vq->qsz]; 30 | } 31 | 32 | static inline void virtq_set_avail_event(struct virtio_virtq *vq, 33 | uint16_t avail_idx) 34 | { 35 | *(le16 *)&vq->used->ring[vq->qsz] = avail_idx; 36 | } 37 | 38 | static int virtq_dequeue_one(struct virtio_virtq *vq, uint16_t head, 39 | virtq_handle_buffers_cb handle_buffers_cb, 40 | void *arg, bool resubmit); 41 | 42 | static struct virtq_iov_private *clone_iov(struct virtio_virtq *vq) 43 | { 44 | struct virtq_iov_private *priv; 45 | uint16_t niov = vq->niov_out + vq->niov_in; 46 | size_t iov_size = sizeof(struct vhd_buffer) * niov; 47 | 48 | priv = vhd_alloc(sizeof(*priv) + iov_size); 49 | memcpy(priv->iov.buffers, vq->buffers, iov_size); 50 | priv->iov.niov_out = vq->niov_out; 51 | priv->iov.iov_out = &priv->iov.buffers[0]; 52 | priv->iov.niov_in = vq->niov_in; 53 | priv->iov.iov_in = &priv->iov.buffers[vq->niov_out]; 54 | return priv; 55 | } 56 | 57 | void virtio_free_iov(struct virtio_iov *iov) 58 | { 59 | struct virtq_iov_private *priv = 60 | containerof(iov, struct virtq_iov_private, iov); 61 | 62 | /* matched with ref in virtq_dequeue_one */ 63 | vhd_memmap_unref(priv->mm); 64 | vhd_free(priv); 65 | } 66 | 67 | uint16_t virtio_iov_get_head(struct virtio_iov *iov) 68 | { 69 | struct virtq_iov_private *priv = 70 | containerof(iov, struct virtq_iov_private, iov); 71 | return priv->used_head; 72 | } 73 | 74 | static int add_buffer(struct virtio_virtq *vq, void *addr, size_t len, bool in) 75 | { 76 | uint16_t niov = vq->niov_out + vq->niov_in; 77 | 78 | if (niov >= vq->max_chain_len) { 79 | VHD_OBJ_ERROR(vq, "descriptor chain exceeds max length %u", 80 | vq->max_chain_len); 81 | return -ENOBUFS; 82 | } 83 | 84 | if (in) { 85 | vq->niov_in++; 86 | } else { 87 | /* 88 | * 2.6.4.2 Driver Requirements: Message Framing The driver MUST place 89 | * any device-writable descriptor elements after any device-readable 90 | * descriptor elements. 
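* For illustration, with virtio-blk as an example: a well-formed request chain is laid out as [header: device-readable][data buffers][status byte: device-writable], so once a device-writable element has been seen, any further device-readable element means the framing is broken, which is what the check below rejects.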
91 | */ 92 | if (vq->niov_in) { 93 | VHD_LOG_ERROR("Device-readable buffer after device-writable"); 94 | return -EINVAL; 95 | } 96 | vq->niov_out++; 97 | } 98 | 99 | vq->buffers[niov] = (struct vhd_buffer) { 100 | .base = addr, 101 | .len = len, 102 | .write_only = in, 103 | }; 104 | 105 | return 0; 106 | } 107 | 108 | static int map_buffer(struct virtio_virtq *vq, uint64_t gpa, size_t len, 109 | bool write_only) 110 | { 111 | void *addr = gpa_range_to_ptr(vq->mm, gpa, len); 112 | if (!addr) { 113 | VHD_OBJ_ERROR(vq, "Failed to map GPA 0x%" PRIx64 ", +0x%zx", gpa, len); 114 | return -EFAULT; 115 | } 116 | 117 | return add_buffer(vq, addr, len, write_only); 118 | } 119 | 120 | /* Modify the inflight descriptor after dequeuing a request from the available ring. */ 121 | static void virtq_inflight_avail_update(struct virtio_virtq *vq, uint16_t head) 122 | { 123 | if (!vq->inflight_region) { 124 | return; 125 | } 126 | 127 | if (vq->inflight_region->desc[head].inflight) { 128 | VHD_OBJ_WARN(vq, "inflight[%u]=%u (expected 0)", head, 129 | vq->inflight_region->desc[head].inflight); 130 | } 131 | 132 | vq->inflight_region->desc[head].counter = vq->req_cnt; 133 | /* 134 | * Ensure the inflight region fields are updated in the expected order, so 135 | * that the next incarnation of the vhost backend can recover the state 136 | * regardless of where the current one dies. There's no concurrent access 137 | * to the inflight region so only a compiler barrier is necessary. 138 | */ 139 | barrier(); 140 | vq->inflight_region->desc[head].inflight = 1; 141 | vq->req_cnt++; 142 | } 143 | 144 | /* Prepare the inflight descriptor for commit. */ 145 | static void virtq_inflight_used_update(struct virtio_virtq *vq, uint16_t head) 146 | { 147 | if (!vq->inflight_region) { 148 | return; 149 | } 150 | 151 | vq->inflight_region->desc[head].next = vq->inflight_region->last_batch_head; 152 | /* 153 | * Ensure the inflight region fields are updated in the expected order, so 154 | * that the next incarnation of the vhost backend can recover the state 155 | * regardless of where the current one dies. There's no concurrent access 156 | * to the inflight region so only a compiler barrier is necessary. 157 | */ 158 | barrier(); 159 | vq->inflight_region->last_batch_head = head; 160 | } 161 | 162 | /* Post-commit inflight descriptor handling. */ 163 | static void virtq_inflight_used_commit(struct virtio_virtq *vq, uint16_t head) 164 | { 165 | if (!vq->inflight_region) { 166 | return; 167 | } 168 | 169 | if (vq->inflight_region->desc[head].inflight != 1) { 170 | VHD_OBJ_WARN(vq, "inflight[%u]=%u (expected 1)", head, 171 | vq->inflight_region->desc[head].inflight); 172 | } 173 | 174 | vq->inflight_region->desc[head].inflight = 0; 175 | /* 176 | * Make sure used_idx is stored after the desc content, so that the next 177 | * incarnation of the vhost backend sees consistent values regardless of 178 | * where the current one dies. There's no concurrent access to the 179 | * inflight region so only a compiler barrier is necessary. 180 | */ 181 | barrier(); 182 | vq->inflight_region->used_idx = vq->used->idx; 183 | } 184 | 185 | /* 186 | * If the value of ``used_idx`` does not match the ``idx`` value of the 187 | * used ring, the inflight field of the ``inflight_split_desc`` entries 188 | * in the last batch may be incorrect and is fixed up here on reconnect.
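* A worked example with hypothetical values: if after a crash used->idx is 42 while the inflight region records used_idx 41, batch_size below is 1, so the descriptor recorded in last_batch_head has in fact completed and its inflight flag is cleared before resubmission; req_cnt is re-seeded from the largest per-descriptor counter seen, so new requests continue with increasing counters.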
189 | */ 190 | static void virtq_inflight_reconnect_update(struct virtio_virtq *vq) 191 | { 192 | uint16_t batch_size; 193 | uint16_t idx; 194 | 195 | vq->req_cnt = 0; 196 | if (!vq->inflight_region) { 197 | return; 198 | } 199 | 200 | /* Initialize the global req counter for the inflight descriptors. */ 201 | for (idx = 0; idx < vq->inflight_region->desc_num; idx++) { 202 | if (vq->inflight_region->desc[idx].counter > vq->req_cnt) { 203 | vq->req_cnt = vq->inflight_region->desc[idx].counter; 204 | } 205 | } 206 | 207 | /* fresh inflight region (not a reconnect) */ 208 | if (!vq->req_cnt) { 209 | goto out; 210 | } 211 | 212 | batch_size = vq->used->idx - vq->inflight_region->used_idx; 213 | if (!batch_size) { 214 | /* Last batch was sent successfully. Nothing to update. */ 215 | goto out; 216 | } 217 | 218 | /* we don't do batching for now */ 219 | VHD_ASSERT(batch_size == 1); 220 | 221 | idx = vq->inflight_region->last_batch_head; 222 | while (batch_size) { 223 | vq->inflight_region->desc[idx].inflight = 0; 224 | idx = vq->inflight_region->desc[idx].next; 225 | batch_size--; 226 | } 227 | 228 | out: 229 | vq->req_cnt++; 230 | vq->inflight_region->used_idx = vq->used->idx; 231 | } 232 | 233 | static void virtio_virtq_reset_stat(struct virtio_virtq *vq) 234 | { 235 | memset(&vq->stat, 0, sizeof(vq->stat)); 236 | } 237 | 238 | /* 239 | * Windows drivers violate the spec and create descriptor chains up to this 240 | * long, regardless of the queue size. 241 | */ 242 | #define WINDOWS_CHAIN_LEN_MAX (512 + 3) 243 | 244 | void virtio_virtq_init(struct virtio_virtq *vq) 245 | { 246 | VHD_ASSERT(!vq->buffers); 247 | 248 | vq->max_chain_len = MAX(vq->qsz, WINDOWS_CHAIN_LEN_MAX); 249 | 250 | vq->buffers = vhd_calloc(vq->max_chain_len, sizeof(vq->buffers[0])); 251 | 252 | /* Make check on the first virtq dequeue. */ 253 | vq->inflight_check = true; 254 | virtq_inflight_reconnect_update(vq); 255 | 256 | virtio_virtq_reset_stat(vq); 257 | } 258 | 259 | void virtio_virtq_release(struct virtio_virtq *vq) 260 | { 261 | VHD_ASSERT(vq->buffers); 262 | vhd_free(vq->buffers); 263 | *vq = (struct virtio_virtq) {}; 264 | } 265 | 266 | struct inflight_resubmit { 267 | uint64_t counter; 268 | uint16_t head; 269 | }; 270 | 271 | static int inflight_resubmit_compare(const void *first, const void *second) 272 | { 273 | struct inflight_resubmit *left = (struct inflight_resubmit *)first; 274 | struct inflight_resubmit *right = (struct inflight_resubmit *)second; 275 | 276 | if (left->counter < right->counter) { 277 | return -1; 278 | } 279 | /* Can't return 0, since counter values are always different. */ 280 | 281 | return 1; 282 | } 283 | 284 | /* Resubmit inflight requests on the virtqueue start. 
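Illustration with hypothetical values: if descriptors 3 and 7 are still marked inflight with counters 12 and 9 respectively, the sort below resubmits head 7 first and then head 3, i.e. in their original submission order.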
*/ 285 | static int virtq_inflight_resubmit(struct virtio_virtq *vq, 286 | virtq_handle_buffers_cb handle_buffers_cb, 287 | void *arg) 288 | { 289 | uint16_t desc_num; 290 | uint16_t cnt; 291 | struct inflight_resubmit *resubmit_array; 292 | uint16_t i; 293 | int res; 294 | 295 | if (!vq->inflight_region) { 296 | return 0; 297 | } 298 | 299 | desc_num = vq->inflight_region->desc_num; 300 | cnt = 0; 301 | resubmit_array = alloca(sizeof(*resubmit_array) * desc_num); 302 | for (i = 0; i < desc_num; i++) { 303 | if (vq->inflight_region->desc[i].inflight) { 304 | resubmit_array[cnt].counter = vq->inflight_region->desc[i].counter; 305 | resubmit_array[cnt].head = i; 306 | cnt++; 307 | } 308 | } 309 | qsort(resubmit_array, cnt, sizeof(*resubmit_array), 310 | inflight_resubmit_compare); 311 | 312 | res = 0; 313 | VHD_OBJ_DEBUG(vq, "cnt = %d inflight requests should be resubmitted", cnt); 314 | for (i = 0; i < cnt; i++) { 315 | uint16_t head = resubmit_array[i].head; 316 | if (head >= vq->qsz) { 317 | VHD_OBJ_ERROR(vq, "resubmit desc %u: head %u past queue size %u", 318 | i, head, vq->qsz); 319 | return -ERANGE; 320 | } 321 | 322 | res = virtq_dequeue_one(vq, head, handle_buffers_cb, arg, true); 323 | if (res) { 324 | break; 325 | } 326 | } 327 | 328 | return res; 329 | } 330 | 331 | bool virtq_is_broken(struct virtio_virtq *vq) 332 | { 333 | return vq->broken; 334 | } 335 | 336 | void mark_broken(struct virtio_virtq *vq) 337 | { 338 | vq->broken = true; 339 | } 340 | 341 | #define DESCRIPTOR_ERROR(vq, idx, desc, fmt, ...) \ 342 | VHD_OBJ_ERROR(vq, "[%u]{0x%" PRIx64 ", +0x%x, 0x%x, %u}: " fmt, \ 343 | (idx), (desc)->addr, (desc)->len, \ 344 | (desc)->flags, (desc)->next, ##__VA_ARGS__) 345 | 346 | static int walk_indirect_table(struct virtio_virtq *vq, 347 | const struct virtq_desc *table_desc) 348 | { 349 | int res; 350 | struct virtq_desc desc; 351 | struct virtq_desc *desc_table; 352 | uint16_t table_len = table_desc->len / sizeof(desc); 353 | uint16_t idx; 354 | 355 | if (table_desc->len == 0 || table_desc->len % sizeof(desc)) { 356 | VHD_OBJ_ERROR(vq, "Bad indirect descriptor table length %u", 357 | table_desc->len); 358 | return -EINVAL; 359 | } 360 | 361 | desc_table = gpa_range_to_ptr(vq->mm, table_desc->addr, table_desc->len); 362 | if (!desc_table) { 363 | VHD_OBJ_ERROR(vq, "Failed to map indirect descriptor table " 364 | "GPA 0x%" PRIx64 ", +0x%x", 365 | table_desc->addr, table_desc->len); 366 | return -EFAULT; 367 | } 368 | 369 | for (idx = 0; ; idx = desc.next) { 370 | desc = desc_table[idx]; 371 | 372 | if (desc.flags & VIRTQ_DESC_F_INDIRECT) { 373 | DESCRIPTOR_ERROR(vq, idx, &desc, "nested indirect descriptor"); 374 | return -EMLINK; 375 | } 376 | 377 | res = map_buffer(vq, desc.addr, desc.len, 378 | desc.flags & VIRTQ_DESC_F_WRITE); 379 | if (res != 0) { 380 | DESCRIPTOR_ERROR(vq, idx, &desc, 381 | "failed to map descriptor in indirect table"); 382 | return res; 383 | } 384 | 385 | if (!(desc.flags & VIRTQ_DESC_F_NEXT)) { 386 | break; 387 | } 388 | 389 | if (desc.next >= table_len) { 390 | DESCRIPTOR_ERROR(vq, idx, &desc, 391 | "next points past indirect table size %u", 392 | table_len); 393 | return -ERANGE; 394 | } 395 | } 396 | 397 | return 0; 398 | } 399 | 400 | /* 401 | * Traverse a descriptor chain starting at @head, mapping the descriptors found 402 | * and pushing them onto @vq->buffers. 403 | * Return the number of descriptors consumed, or -errno. 
404 | */ 405 | static int walk_chain(struct virtio_virtq *vq, uint16_t head) 406 | { 407 | uint16_t idx; 408 | uint16_t chain_len; 409 | struct virtq_desc desc; 410 | int res; 411 | 412 | vq->niov_out = vq->niov_in = 0; 413 | 414 | for (idx = head, chain_len = 1; ; idx = desc.next, chain_len++) { 415 | desc = vq->desc[idx]; 416 | 417 | if (desc.flags & VIRTQ_DESC_F_INDIRECT) { 418 | if (desc.flags & VIRTQ_DESC_F_NEXT) { 419 | DESCRIPTOR_ERROR(vq, idx, &desc, 420 | "indirect descriptor must have no next"); 421 | return -EINVAL; 422 | } 423 | 424 | res = walk_indirect_table(vq, &desc); 425 | if (res != 0) { 426 | DESCRIPTOR_ERROR(vq, idx, &desc, 427 | "failed to walk indirect descriptor table"); 428 | return res; 429 | } 430 | 431 | break; 432 | } 433 | 434 | res = map_buffer(vq, desc.addr, desc.len, 435 | desc.flags & VIRTQ_DESC_F_WRITE); 436 | if (res != 0) { 437 | DESCRIPTOR_ERROR(vq, idx, &desc, "failed to map"); 438 | return res; 439 | } 440 | 441 | if (!(desc.flags & VIRTQ_DESC_F_NEXT)) { 442 | break; 443 | } 444 | 445 | if (desc.next >= vq->qsz) { 446 | DESCRIPTOR_ERROR(vq, idx, &desc, 447 | "next points past queue size %u", vq->qsz); 448 | return -ERANGE; 449 | } 450 | } 451 | 452 | return chain_len; 453 | } 454 | 455 | int virtq_dequeue_many(struct virtio_virtq *vq, 456 | virtq_handle_buffers_cb handle_buffers_cb, 457 | void *arg) 458 | { 459 | int res; 460 | uint16_t i; 461 | uint16_t num_avail; 462 | uint16_t avail, avail2; 463 | time_t now; 464 | 465 | if (virtq_is_broken(vq)) { 466 | VHD_OBJ_ERROR(vq, "virtqueue is broken, cannot process"); 467 | return -ENODEV; 468 | } 469 | 470 | if (vq->inflight_check) { 471 | /* Check for the inflight requests once at the start. */ 472 | VHD_OBJ_DEBUG(vq, "resubmit inflight requests, if any"); 473 | res = virtq_inflight_resubmit(vq, handle_buffers_cb, arg); 474 | if (res) { 475 | goto queue_broken; 476 | } 477 | vq->inflight_check = false; 478 | } 479 | 480 | now = time(NULL); 481 | 482 | if (now - vq->stat.period_start_ts > 60) { 483 | vq->stat.period_start_ts = now; 484 | vq->stat.metrics.queue_len_max_60s = 0; 485 | } 486 | 487 | vq->stat.metrics.dispatch_total++; 488 | 489 | avail = vq->avail->idx; 490 | if (vq->has_event_idx) { 491 | smp_mb(); /* avail->idx read followed by avail_event write */ 492 | while (true) { 493 | virtq_set_avail_event(vq, avail); 494 | smp_mb(); /* avail_event write followed by avail->idx read */ 495 | avail2 = vq->avail->idx; 496 | if (avail2 == avail) { 497 | break; 498 | } 499 | smp_mb(); /* avail->idx read followed by avail_event write */ 500 | avail = avail2; 501 | } 502 | } 503 | 504 | num_avail = avail - vq->last_avail; 505 | if (num_avail > vq->qsz) { 506 | VHD_OBJ_ERROR(vq, "num_avail %u (%u - %u) exceeds queue size %u", 507 | num_avail, avail, vq->last_avail, vq->qsz); 508 | return -EOVERFLOW; 509 | } 510 | 511 | if (!num_avail) { 512 | vq->stat.metrics.dispatch_empty++; 513 | return 0; 514 | } 515 | 516 | vq->stat.metrics.queue_len_last = num_avail; 517 | if (vq->stat.metrics.queue_len_last > vq->stat.metrics.queue_len_max_60s) { 518 | vq->stat.metrics.queue_len_max_60s = vq->stat.metrics.queue_len_last; 519 | } 520 | 521 | /* Make sure that further desc reads do not pass avail->idx read. 
*/ 522 | smp_rmb(); /* barrier pair [A] */ 523 | 524 | /* TODO: disable extra notifies from this point */ 525 | 526 | for (i = 0; i < num_avail; ++i) { 527 | /* Grab next descriptor head */ 528 | uint16_t head = vq->avail->ring[vq->last_avail % vq->qsz]; 529 | if (head >= vq->qsz) { 530 | VHD_OBJ_ERROR(vq, "avail %u: head %u past queue size %u", 531 | vq->last_avail, head, vq->qsz); 532 | return -ERANGE; 533 | } 534 | 535 | res = virtq_dequeue_one(vq, head, handle_buffers_cb, arg, false); 536 | if (res) { 537 | goto queue_broken; 538 | } 539 | 540 | vq->stat.metrics.request_total++; 541 | } 542 | 543 | /* TODO: restore notifier mask here */ 544 | return 0; 545 | 546 | queue_broken: 547 | mark_broken(vq); 548 | return res; 549 | } 550 | 551 | static int virtq_dequeue_one(struct virtio_virtq *vq, uint16_t head, 552 | virtq_handle_buffers_cb handle_buffers_cb, 553 | void *arg, bool resubmit) 554 | { 555 | int ret; 556 | 557 | ret = walk_chain(vq, head); 558 | if (ret < 0) { 559 | return ret; 560 | } 561 | 562 | /* Create iov copy from stored buffer for client handling */ 563 | struct virtq_iov_private *priv = clone_iov(vq); 564 | priv->used_head = head; 565 | priv->mm = vq->mm; 566 | /* matched with unref in virtio_free_iov */ 567 | vhd_memmap_ref(priv->mm); 568 | 569 | if (!resubmit) { 570 | virtq_inflight_avail_update(vq, head); 571 | } 572 | 573 | /* Send this over to handler */ 574 | handle_buffers_cb(arg, vq, &priv->iov); 575 | 576 | vq->last_avail++; 577 | 578 | return 0; 579 | } 580 | 581 | static void vhd_log_buffers(struct vhd_memory_log *log, 582 | struct vhd_memory_map *mm, 583 | struct virtio_iov *viov) 584 | { 585 | uint16_t i; 586 | for (i = 0; i < viov->niov_in; ++i) { 587 | struct vhd_buffer *iov = &viov->iov_in[i]; 588 | vhd_mark_range_dirty(log, mm, iov->base, iov->len); 589 | } 590 | } 591 | 592 | /* 593 | * NOTE: this @mm is the one the request was started with, not the current one 594 | * on @vq 595 | */ 596 | static void vhd_log_modified(struct virtio_virtq *vq, 597 | struct vhd_memory_map *mm, 598 | struct virtio_iov *iov, 599 | uint16_t used_idx) 600 | { 601 | /* log modifications of buffers in descr */ 602 | vhd_log_buffers(vq->log, mm, iov); 603 | if (vq->flags & VHOST_VRING_F_LOG) { 604 | /* log modification of used->idx */ 605 | vhd_mark_gpa_range_dirty(vq->log, 606 | vq->used_gpa_base + 607 | offsetof(struct virtq_used, idx), 608 | sizeof(vq->used->idx)); 609 | /* log modification of used->ring[idx] */ 610 | vhd_mark_gpa_range_dirty(vq->log, 611 | vq->used_gpa_base + 612 | offsetof(struct virtq_used, ring[used_idx]), 613 | sizeof(vq->used->ring[0])); 614 | } 615 | } 616 | 617 | static void virtq_do_notify(struct virtio_virtq *vq) 618 | { 619 | if (vq->notify_fd != -1) { 620 | eventfd_write(vq->notify_fd, 1); 621 | } 622 | } 623 | 624 | static bool virtq_need_notify(struct virtio_virtq *vq) 625 | { 626 | if (!vq->has_event_idx) { 627 | /* 628 | * Virtio specification v1.0, 5.1.6.2.3: 629 | * Often a driver will suppress transmission interrupts using the 630 | * VIRTQ_AVAIL_F_NO_INTERRUPT flag (see 3.2.2 Receiving Used Buffers 631 | * From The Device) and check for used packets in the transmit path 632 | * of following packets. 633 | */ 634 | return !(vq->avail->flags & VIRTQ_AVAIL_F_NO_INTERRUPT); 635 | } 636 | 637 | /* 638 | * Virtio specification v1.0, 2.4.7.2: 639 | * if the VIRTIO_F_EVENT_IDX feature bit is negotiated: 640 | * The device MUST ignore the lower bit of flags. 
641 | * If the idx field in the used ring was 642 | * equal to used_event, the device MUST send an interrupt. 643 | * -------------------------------------------------------- 644 | * Note: the code below assumes that virtq_notify is always called 645 | * once per completion, and never per batch. 646 | */ 647 | return virtq_get_used_event(vq) == (uint16_t)(vq->used->idx - 1); 648 | } 649 | 650 | static void virtq_notify(struct virtio_virtq *vq) 651 | { 652 | /* expose used ring entries before checking used event */ 653 | smp_mb(); 654 | 655 | if (virtq_need_notify(vq)) { 656 | virtq_do_notify(vq); 657 | } 658 | } 659 | 660 | void virtq_push(struct virtio_virtq *vq, struct virtio_iov *iov, uint32_t len) 661 | { 662 | /* Put buffer head index and len into used ring */ 663 | struct virtq_iov_private *priv = containerof(iov, struct virtq_iov_private, 664 | iov); 665 | uint16_t used_idx = vq->used->idx % vq->qsz; 666 | struct virtq_used_elem *used = &vq->used->ring[used_idx]; 667 | used->id = priv->used_head; 668 | used->len = len; 669 | 670 | virtq_inflight_used_update(vq, used->id); 671 | 672 | smp_wmb(); /* barrier pair [A] */ 673 | vq->used->idx++; 674 | 675 | virtq_inflight_used_commit(vq, used->id); 676 | VHD_OBJ_DEBUG(vq, "head = %d", priv->used_head); 677 | 678 | /* use the memmap the request was started with rather than the current one */ 679 | if (vq->log) { 680 | vhd_log_modified(vq, priv->mm, &priv->iov, used_idx); 681 | } 682 | 683 | virtq_notify(vq); 684 | vq->stat.metrics.request_completed++; 685 | } 686 | 687 | void virtq_set_notify_fd(struct virtio_virtq *vq, int fd) 688 | { 689 | vq->notify_fd = fd; 690 | 691 | /* 692 | * Always notify the new fd: on initial setup QEMU sets up kick_fd 693 | * before call_fd, so before call_fd becomes configured there may 694 | * already be processed descriptors that the guest wasn't notified about. 695 | * And on reconnect, the connection may have been lost before the server 696 | * has had a chance to signal the guest. 697 | */ 698 | virtq_do_notify(vq); 699 | } 700 | 701 | void virtio_virtq_get_stat(struct virtio_virtq *vq, 702 | struct vhd_vq_metrics *metrics) 703 | { 704 | *metrics = vq->stat.metrics; 705 | } 706 | 707 | __attribute__((weak)) 708 | void abort_request(struct virtio_virtq *vq, struct virtio_iov *iov) 709 | { 710 | /* 711 | * FIXME: this is called when the message framing is messed up. This 712 | * appears severe enough to just stop processing the virtq and mark it 713 | * broken. 714 | */ 715 | VHD_LOG_ERROR("no valid virtio request found, queue %p should be aborted", vq); 716 | virtq_push(vq, iov, 0); 717 | virtio_free_iov(iov); 718 | } 719 | --------------------------------------------------------------------------------
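For orientation, here is a minimal device-side consumer sketch built on the virt_queue.c API above. It is illustrative only: the callback signature is assumed to match the call site in virtq_dequeue_one(), and the handler names and the zero-fill "work" are hypothetical, not part of this library.

#include <stdint.h>
#include <string.h>

#include "virt_queue.h"

/* hypothetical request handler: zero-fill every device-writable buffer */
static void my_handle_request(void *arg, struct virtio_virtq *vq,
                              struct virtio_iov *iov)
{
    uint32_t len = 0;
    uint16_t i;

    (void)arg; /* would carry the device pointer, unused in this sketch */

    for (i = 0; i < iov->niov_in; i++) {
        memset(iov->iov_in[i].base, 0, iov->iov_in[i].len);
        len += iov->iov_in[i].len;
    }

    /* publish the head and the number of bytes written to the used ring */
    virtq_push(vq, iov, len);
    /* drop the memmap reference taken at dequeue time */
    virtio_free_iov(iov);
}

/* hypothetical kick handler: drain the available ring */
static void my_on_kick(struct virtio_virtq *vq, void *dev)
{
    if (virtq_dequeue_many(vq, my_handle_request, dev) < 0) {
        /* the queue has been marked broken; stop processing it */
    }
}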