├── .gitignore ├── logging.c ├── tests ├── .gitignore ├── test_utils.h ├── meson.build └── test_libvhost.py ├── subprojects └── libblkio.wrap ├── meson_options.txt ├── .exrc ├── platform.c ├── ya.make ├── virtio ├── virtio_types.h ├── virtio_fs.h ├── virtio_fs_spec.h ├── virtio_spec.h ├── virtio_blk.h ├── virt_queue.h ├── virtio_fs.c ├── virtio_blk_spec.h ├── virtio_blk.c └── virt_queue.c ├── memlog.h ├── LICENSE ├── bio.h ├── README.md ├── memmap.h ├── server_internal.h ├── CMakeLists.txt ├── logging.h ├── meson.build ├── queue.h ├── include └── vhost │ ├── fs.h │ ├── types.h │ ├── server.h │ └── blockdev.h ├── .github └── workflows │ └── main.yaml ├── objref.h ├── memlog.c ├── catomic.h ├── fs.c ├── event.h ├── blockdev.c ├── vhost_spec.h ├── platform.h ├── vdev.h ├── docs ├── logo.svg └── architecture.md ├── server.c ├── memmap.c └── event.c /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | subprojects/libblkio/ 3 | -------------------------------------------------------------------------------- /logging.c: -------------------------------------------------------------------------------- 1 | #include "logging.h" 2 | 3 | log_function g_log_fn; 4 | -------------------------------------------------------------------------------- /tests/.gitignore: -------------------------------------------------------------------------------- 1 | work/ 2 | libblkio/ 3 | .pytest_cache/ 4 | __pycache__/ -------------------------------------------------------------------------------- /subprojects/libblkio.wrap: -------------------------------------------------------------------------------- 1 | [wrap-git] 2 | url = https://gitlab.com/libblkio/libblkio.git/ 3 | revision = f1eabd1b 4 | -------------------------------------------------------------------------------- /meson_options.txt: -------------------------------------------------------------------------------- 1 | option('libblkio', type : 'feature', value : 'auto', description : 'Pull libblkio subproject (required for tests)') 2 | -------------------------------------------------------------------------------- /.exrc: -------------------------------------------------------------------------------- 1 | "VIM settings to match QEMU coding style. 
They are activated by adding the 2 | "following settings (without the " symbol) as last two lines in $HOME/.vimrc: 3 | "set secure 4 | "set exrc 5 | set expandtab 6 | set shiftwidth=4 7 | set smarttab 8 | -------------------------------------------------------------------------------- /platform.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "platform.h" 5 | 6 | int init_platform_page_size(void) 7 | { 8 | if (!platform_page_size) { 9 | long result = sysconf(_SC_PAGESIZE); 10 | if (result < 0) { 11 | return errno; 12 | } 13 | platform_page_size = result; 14 | } 15 | 16 | return 0; 17 | } 18 | -------------------------------------------------------------------------------- /ya.make: -------------------------------------------------------------------------------- 1 | LIBRARY(vhost-server) 2 | 3 | CFLAGS( 4 | -Wno-unused-parameter 5 | ) 6 | 7 | SRCS( 8 | blockdev.c 9 | event.c 10 | fs.c 11 | logging.c 12 | memlog.c 13 | memmap.c 14 | server.c 15 | vdev.c 16 | platform.c 17 | virtio/virt_queue.c 18 | virtio/virtio_blk.c 19 | virtio/virtio_fs.c 20 | ) 21 | 22 | ADDINCL( 23 | GLOBAL cloud/contrib/vhost/include 24 | cloud/contrib/vhost 25 | ) 26 | 27 | END() 28 | 29 | -------------------------------------------------------------------------------- /virtio/virtio_types.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Definitions from virtio spec version 1.0 3 | * http://docs.oasis-open.org/virtio/virtio/v1.0/virtio-v1.0.html. 4 | * 5 | * Type naming and style is preserved verbatim from virtio spec. 6 | */ 7 | 8 | #pragma once 9 | 10 | #ifdef __cplusplus 11 | extern "C" { 12 | #endif 13 | 14 | typedef uint8_t u8; 15 | 16 | #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ 17 | typedef uint16_t le16; 18 | typedef uint32_t le32; 19 | typedef uint64_t le64; 20 | #else 21 | # error Implement me 22 | #endif 23 | 24 | #ifdef __cplusplus 25 | } 26 | #endif 27 | -------------------------------------------------------------------------------- /memlog.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "memmap.h" 6 | 7 | #ifdef __cplusplus 8 | extern "C" { 9 | #endif 10 | 11 | struct vhd_memory_log; 12 | 13 | struct vhd_memory_log *vhd_memlog_new(size_t size, int fd, off_t offset); 14 | void vhd_memlog_free(struct vhd_memory_log *log); 15 | 16 | void vhd_mark_range_dirty(struct vhd_memory_log *log, 17 | struct vhd_memory_map *mm, void *ptr, size_t len); 18 | void vhd_mark_gpa_range_dirty(struct vhd_memory_log *log, uint64_t gpa, 19 | size_t len); 20 | 21 | #ifdef __cplusplus 22 | } 23 | #endif 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2024 YANDEX LLC 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | -------------------------------------------------------------------------------- /bio.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Internally used representation of a block io request passed to and returned 3 | * from the block backend. 4 | */ 5 | 6 | #pragma once 7 | 8 | #include "queue.h" 9 | #include "vhost/server.h" 10 | 11 | #ifdef __cplusplus 12 | extern "C" { 13 | #endif 14 | 15 | struct vhd_vring; 16 | 17 | struct vhd_io { 18 | enum vhd_bdev_io_result status; 19 | struct vhd_vring *vring; 20 | 21 | void (*completion_handler)(struct vhd_io *io); 22 | 23 | TAILQ_ENTRY(vhd_io) submission_link; 24 | TAILQ_ENTRY(vhd_io) inflight_link; 25 | SLIST_ENTRY(vhd_io) completion_link; 26 | 27 | time_t ts; 28 | }; 29 | 30 | #ifdef __cplusplus 31 | } 32 | #endif 33 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | <h1>Libvhost</h1>
3 |
4 |
5 | 6 | [![CI](https://github.com/yandex-cloud/yc-libvhost-server/actions/workflows/main.yaml/badge.svg)](https://github.com/yandex-cloud/yc-libvhost-server/actions/workflows/main.yaml) 7 | 8 | A library for building [vhost-user protocol](https://qemu-project.gitlab.io/qemu/interop/vhost-user.html) servers. 9 | 10 | ## Quickstart 11 | 12 | Building the project: 13 | ```bash 14 | CC=clang meson setup build 15 | ninja -C build 16 | ``` 17 | 18 | Running tests locally: 19 | ``` 20 | ninja test -C build 21 | ``` 22 | -------------------------------------------------------------------------------- /virtio/virtio_fs.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "virtio_fs_spec.h" 4 | 5 | #ifdef __cplusplus 6 | extern "C" { 7 | #endif 8 | 9 | struct virtio_fs_dev; 10 | struct virtio_virtq; 11 | 12 | struct vhd_fsdev_info; 13 | struct vhd_bio; 14 | struct vhd_guest_memory_map; 15 | 16 | #define VIRTIO_FS_DEFAULT_FEATURES ((uint64_t)( \ 17 | (1UL << VIRTIO_F_RING_INDIRECT_DESC) | \ 18 | (1UL << VIRTIO_F_VERSION_1))) 19 | 20 | /** 21 | * Virtio file system device context 22 | */ 23 | struct virtio_fs_dev { 24 | struct vhd_fsdev_info *fsdev; 25 | 26 | /* fs config data generated on init from fsdev */ 27 | struct virtio_fs_config config; 28 | }; 29 | 30 | /** 31 | * Init virtio fs device context from fsdev info 32 | */ 33 | int virtio_fs_init_dev( 34 | struct virtio_fs_dev *dev, 35 | struct vhd_fsdev_info *fsdev); 36 | 37 | /** 38 | * Dispatch requests from device virtq 39 | */ 40 | int virtio_fs_dispatch_requests(struct virtio_fs_dev *dev, 41 | struct virtio_virtq *vq); 42 | 43 | #ifdef __cplusplus 44 | } 45 | #endif 46 | -------------------------------------------------------------------------------- /tests/test_utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "vhost/server.h" 8 | 9 | /* Normally we pass LOG_VERBOSITY from make */ 10 | #ifndef LOG_VERBOSITY 11 | # define LOG_VERBOSITY LOG_INFO 12 | #endif 13 | 14 | /* Log function for tests */ 15 | static const char *const log_level_str[] = { 16 | "ERROR", 17 | "WARNING", 18 | "INFO", 19 | "DEBUG", 20 | }; 21 | 22 | __attribute__((format(printf, 2, 3))) 23 | static inline void vhd_log_stderr(enum LogLevel level, const char *fmt, ...) 
24 | { 25 | va_list args; 26 | va_start(args, fmt); 27 | if (level <= LOG_VERBOSITY) { 28 | char timestr[64]; 29 | struct timeval tv; 30 | 31 | gettimeofday(&tv, NULL); 32 | strftime(timestr, sizeof(timestr), "%F %T", localtime(&tv.tv_sec)); 33 | fprintf(stderr, "%s.%03ld [%8s] ", timestr, tv.tv_usec / 1000, 34 | log_level_str[level]); 35 | vfprintf(stderr, fmt, args); 36 | fprintf(stderr, "\n"); 37 | } 38 | va_end(args); 39 | } 40 | -------------------------------------------------------------------------------- /memmap.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #ifdef __cplusplus 9 | extern "C" { 10 | #endif 11 | 12 | struct vhd_memory_map; 13 | 14 | struct vhd_memory_map *vhd_memmap_new(int (*map_cb)(void *, size_t), 15 | int (*unmap_cb)(void *, size_t)); 16 | struct vhd_memory_map *vhd_memmap_dup(struct vhd_memory_map *mm); 17 | struct vhd_memory_map *vhd_memmap_dup_remap(struct vhd_memory_map *mm); 18 | 19 | size_t vhd_memmap_max_memslots(void); 20 | 21 | int vhd_memmap_add_slot(struct vhd_memory_map *mm, uint64_t gpa, uint64_t uva, 22 | size_t size, int fd, off_t offset, bool preserve_fd); 23 | int vhd_memmap_del_slot(struct vhd_memory_map *mm, uint64_t gpa, uint64_t uva, 24 | size_t size); 25 | 26 | void vhd_memmap_ref(struct vhd_memory_map *mm); 27 | void vhd_memmap_unref(struct vhd_memory_map *mm); 28 | 29 | void *gpa_range_to_ptr(struct vhd_memory_map *mm, uint64_t gpa, size_t len); 30 | void *uva_to_ptr(struct vhd_memory_map *mm, uint64_t uva); 31 | #define TRANSLATION_FAILED ((uint64_t)-1) 32 | uint64_t ptr_to_gpa(struct vhd_memory_map *mm, void *ptr); 33 | 34 | #ifdef __cplusplus 35 | } 36 | #endif 37 | -------------------------------------------------------------------------------- /virtio/virtio_fs_spec.h: -------------------------------------------------------------------------------- 1 | /* 2 | * virtio-fs protocol definitions 3 | */ 4 | 5 | #pragma once 6 | 7 | #include "virtio_spec.h" 8 | 9 | #ifdef __cplusplus 10 | extern "C" { 11 | #endif 12 | 13 | /* 14 | * Device configuration layout. 15 | */ 16 | struct VHD_PACKED virtio_fs_config { 17 | /* Filesystem name (UTF-8, not NUL-terminated, padded with NULs) */ 18 | u8 tag[36]; 19 | 20 | /* Number of request queues exposed by the device. */ 21 | le32 num_request_queues; 22 | }; 23 | 24 | /* 25 | * Generic FUSE request in/out headers. 26 | * FIXME: these are duplicates of fuse_in_header/fuse_out_header, and should be 27 | * removed in favor of the latter. 28 | */ 29 | struct virtio_fs_in_header { 30 | le32 len; 31 | le32 opcode; 32 | le64 unique; 33 | le64 nodeid; 34 | le32 uid; 35 | le32 gid; 36 | le32 pid; 37 | le32 padding; 38 | }; 39 | 40 | struct virtio_fs_out_header { 41 | le32 len; 42 | le32 error; 43 | le64 unique; 44 | }; 45 | 46 | /* 47 | * Device operation request. 
48 | * 49 | * Request is a variable sized structure: 50 | * struct virtio_fs_req { 51 | * // Device-readable part 52 | * struct virtio_fs_in_header in; 53 | * u8 datain[]; 54 | * 55 | * // Device-writable part 56 | * struct virtio_fs_out_header out; 57 | * u8 dataout[]; 58 | * }; 59 | */ 60 | 61 | #ifdef __cplusplus 62 | } 63 | #endif 64 | -------------------------------------------------------------------------------- /server_internal.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "vhost/server.h" 4 | 5 | struct vhd_io_handler; 6 | /* Add io handler to vhost control event loop */ 7 | struct vhd_io_handler *vhd_add_vhost_io_handler(int fd, int (*read)(void *), 8 | void *opaque); 9 | 10 | struct vhd_request_queue; 11 | /* Add io handler to request queue event loop */ 12 | struct vhd_io_handler *vhd_add_rq_io_handler(struct vhd_request_queue *rq, 13 | int fd, int (*read)(void *), 14 | void *opaque); 15 | 16 | struct vhd_vdev; 17 | struct vhd_io; 18 | struct vhd_vring; 19 | 20 | /** 21 | * Enqueue IO request 22 | */ 23 | int vhd_enqueue_request(struct vhd_request_queue *rq, 24 | struct vhd_io *io); 25 | 26 | void vhd_cancel_queued_requests(struct vhd_request_queue *rq, 27 | const struct vhd_vring *vring); 28 | 29 | /** 30 | * Run callback in request queue 31 | */ 32 | void vhd_run_in_rq(struct vhd_request_queue *rq, void (*cb)(void *), 33 | void *opaque); 34 | 35 | /* 36 | * Run callback in vhost control event loop 37 | */ 38 | void vhd_run_in_ctl(void (*cb)(void *), void *opaque); 39 | 40 | /* 41 | * Submit a work item onto vhost control event loop and wait till it's 42 | * finished. 43 | */ 44 | struct vhd_work; 45 | int vhd_submit_ctl_work_and_wait(void (*func)(struct vhd_work *, void *), 46 | void *opaque); 47 | 48 | bool vhd_in_ctl_thread(void); 49 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.15) 2 | 3 | project(yc-libvhost-server C) 4 | 5 | if(UNIX AND NOT APPLE) 6 | set(LINUX TRUE) 7 | endif() 8 | 9 | if(NOT LINUX) 10 | message(FATAL_ERROR "Unsupported platform") 11 | endif() 12 | 13 | set(LIBVHOST_LOG_VERBOSITY "LOG_INFO" CACHE STRING "Libvhost log verbosity") 14 | message("Compiler ${CMAKE_C_COMPILER}") 15 | message("Libvhost log verbosity: ${LIBVHOST_LOG_VERBOSITY}") 16 | 17 | add_library(vhost-server) 18 | add_compile_definitions(_GNU_SOURCE LOG_VERBOSITY=${LIBVHOST_LOG_VERBOSITY}) 19 | target_compile_options(vhost-server PRIVATE 20 | -Wall 21 | -Werror 22 | -Wextra 23 | -Wno-unused-parameter 24 | -g 25 | -O2 26 | 27 | # make these warnings non-fatal in gcc 28 | $<$: 29 | -Wno-error=unused-value 30 | -Wno-error=unused-result 31 | -Wno-error=strict-aliasing 32 | > 33 | 34 | # enable additional warnings to enforce coding standards 35 | -Wmissing-prototypes 36 | -Wmissing-declarations 37 | $<$: 38 | -Wmissing-variable-declarations 39 | -Wzero-length-array 40 | > 41 | $<$: 42 | -Wzero-length-bounds 43 | > 44 | ) 45 | target_include_directories(vhost-server PUBLIC 46 | include 47 | ) 48 | target_include_directories(vhost-server PRIVATE 49 | ./ 50 | ) 51 | target_sources(vhost-server PRIVATE 52 | blockdev.c 53 | event.c 54 | fs.c 55 | logging.c 56 | memlog.c 57 | memmap.c 58 | server.c 59 | vdev.c 60 | platform.c 61 | virtio/virt_queue.c 62 | virtio/virtio_blk.c 63 | virtio/virtio_fs.c 64 | ) 65 | 
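
The CMakeLists.txt above (like meson.build and ya.make elsewhere in this repository) builds the same static `vhost-server` library. As a rough orientation before the test harness that follows, here is a minimal consumer sketch driving the public API declared in `include/vhost/server.h` (shown further below). The `main()` scaffolding, the `log_stderr` helper, and completing every request with `VHD_BDEV_SUCCESS` are illustrative assumptions, not library code.

```c
/*
 * Hypothetical consumer sketch: links against the vhost-server static
 * library and drives the public API from include/vhost/server.h.
 * A real application would also register a device (e.g. with
 * vhd_register_fs() from include/vhost/fs.h) and do real request processing.
 */
#include <errno.h>
#include <stdarg.h>
#include <stdio.h>

#include "vhost/server.h"

static void log_stderr(enum LogLevel level, const char *fmt, ...)
{
    va_list args;
    (void)level;            /* verbosity filtering elided in this sketch */
    va_start(args, fmt);
    vfprintf(stderr, fmt, args);
    fprintf(stderr, "\n");
    va_end(args);
}

int main(void)
{
    /* Spawn the control thread that accepts vhost-user connections */
    if (vhd_start_vhost_server(log_stderr) < 0) {
        return 1;
    }

    /* A request queue is the unit of load balancing between devices */
    struct vhd_request_queue *rq = vhd_create_request_queue();

    /*
     * A device would be registered here and attached to rq so that its
     * I/O requests land in this queue.
     */

    /* Run the queue until vhd_stop_queue() is called from another thread */
    while (vhd_run_queue(rq) == -EAGAIN) {
        struct vhd_request req;
        while (vhd_dequeue_request(rq, &req)) {
            /* ... process req.io here, then signal completion ... */
            vhd_complete_bio(req.io, VHD_BDEV_SUCCESS);
        }
    }

    vhd_release_request_queue(rq);
    vhd_stop_vhost_server();
    return 0;
}
```
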
-------------------------------------------------------------------------------- /tests/meson.build: -------------------------------------------------------------------------------- 1 | libaio = cc.find_library('aio', required: true) 2 | libpthread = cc.find_library('pthread', required: true) 3 | 4 | vhost_user_blk_test_server_includes = include_directories( 5 | '../' 6 | ) 7 | 8 | vhost_user_blk_test_server = executable( 9 | 'vhost-user-blk-test-server', 10 | 'vhost_user_blk_test_server.c', 11 | link_with: libvhost, 12 | dependencies: [libaio, libpthread], 13 | include_directories: [ 14 | vhost_user_blk_test_server_includes, 15 | libvhost_includes 16 | ] 17 | ) 18 | 19 | # If libblkio is disabled, we have no client to run 20 | # against vhost-user-blk-test-server, so nothing to do here. 21 | if not libblkio_proj.found() 22 | subdir_done() 23 | endif 24 | 25 | # If libblkio subproject doesn't define blkio_bench, this yields 26 | # a fatal error. It is OK as we pull a specific libblkio revision 27 | # which is known to define blkio_bench, and if it is missing, 28 | # something is certainly wrong. 29 | libblkio_bench_dep = libblkio_proj.get_variable('blkio_bench') 30 | 31 | envdata = environment() 32 | envdata.append( 33 | 'TEST_SERVER_BINARY', 34 | vhost_user_blk_test_server.full_path() 35 | ) 36 | envdata.append( 37 | 'BLKIO_BENCH_BINARY', 38 | libblkio_bench_dep.full_path() 39 | ) 40 | 41 | test( 42 | 'unit-tests', 43 | import('python').find_installation('python3', modules: ['pytest']), 44 | args: ['-m', 'pytest', '-rsv'], 45 | depends: [vhost_user_blk_test_server, libblkio_bench_dep], 46 | env: envdata, 47 | workdir: meson.current_source_dir(), 48 | timeout: 150, 49 | is_parallel: false, 50 | ) 51 | -------------------------------------------------------------------------------- /logging.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "platform.h" 4 | 5 | #include "vhost/server.h" 6 | 7 | extern log_function __attribute__((format(printf, 2, 3))) g_log_fn; 8 | 9 | #define VHD_LOG(level, fmt, ...) \ 10 | do { \ 11 | if (g_log_fn) { \ 12 | g_log_fn(level, "%s:%d: " fmt, \ 13 | __func__, __LINE__, ##__VA_ARGS__); \ 14 | } \ 15 | } while (0) 16 | 17 | #ifndef VHD_NO_DEBUG_LOGS 18 | # define VHD_LOG_DEBUG(fmt, ...) VHD_LOG(LOG_DEBUG, fmt, ##__VA_ARGS__) 19 | #else 20 | # define VHD_LOG_DEBUG(fmt, ...) 21 | #endif 22 | 23 | #define VHD_LOG_INFO(fmt, ...) VHD_LOG(LOG_INFO, fmt, ##__VA_ARGS__) 24 | #define VHD_LOG_WARN(fmt, ...) VHD_LOG(LOG_WARNING, fmt, ##__VA_ARGS__) 25 | #define VHD_LOG_ERROR(fmt, ...) VHD_LOG(LOG_ERROR, fmt, ##__VA_ARGS__) 26 | 27 | /* 28 | * Generic helpers to produce log messages tagged by an object. For that, the 29 | * object must provide duck-typed interface of ->log_tag field of type "const 30 | * char *". 31 | */ 32 | #define VHD_OBJ_DEBUG(obj, fmt, ...) \ 33 | VHD_LOG_DEBUG("%s: " fmt, obj->log_tag, ##__VA_ARGS__) 34 | #define VHD_OBJ_INFO(obj, fmt, ...) \ 35 | VHD_LOG_INFO("%s: " fmt, obj->log_tag, ##__VA_ARGS__) 36 | #define VHD_OBJ_WARN(obj, fmt, ...) \ 37 | VHD_LOG_WARN("%s: " fmt, obj->log_tag, ##__VA_ARGS__) 38 | #define VHD_OBJ_ERROR(obj, fmt, ...) 
\ 39 | VHD_LOG_ERROR("%s: " fmt, obj->log_tag, ##__VA_ARGS__) 40 | 41 | -------------------------------------------------------------------------------- /meson.build: -------------------------------------------------------------------------------- 1 | project( 2 | 'yc-libvhost-server', 'c' 3 | ) 4 | 5 | libvhost_log_verbosity = 'LOG_INFO' 6 | 7 | libvhost_includes = include_directories( 8 | 'include' 9 | ) 10 | 11 | libvhost_sources = files([ 12 | 'blockdev.c', 13 | 'event.c', 14 | 'fs.c', 15 | 'logging.c', 16 | 'memlog.c', 17 | 'memmap.c', 18 | 'server.c', 19 | 'vdev.c', 20 | 'platform.c', 21 | 'virtio/virtio_blk.c', 22 | 'virtio/virtio_fs.c', 23 | 'virtio/virt_queue.c' 24 | ]) 25 | 26 | libvhost_args = [ 27 | '-Wall', 28 | '-Werror', 29 | '-Wextra', 30 | '-Wno-unused-parameter', 31 | '-g', 32 | '-O2', 33 | ] 34 | 35 | cc = meson.get_compiler('c') 36 | 37 | if cc.get_id() == 'gcc' 38 | libvhost_args += [ 39 | '-Wno-error=unused-value', 40 | '-Wno-error=unused-result', 41 | '-Wno-error=strict-aliasing', 42 | ] 43 | endif 44 | 45 | libvhost_optional_args = cc.get_supported_arguments( 46 | '-Wmissing-prototypes', 47 | '-Wmissing-variable-declarations', 48 | '-Wzero-length-array', 49 | '-Wzero-length-bounds', 50 | ) 51 | 52 | libvhost_defines = [ 53 | '-D_GNU_SOURCE', 54 | '-DLOG_VERBOSITY=' + libvhost_log_verbosity 55 | ] 56 | 57 | libvhost = static_library( 58 | 'vhost', 59 | sources: libvhost_sources, 60 | include_directories: libvhost_includes, 61 | c_args: libvhost_args + libvhost_optional_args + libvhost_defines, 62 | ) 63 | 64 | libblkio_proj = subproject( 65 | 'libblkio', 66 | default_options: [ 67 | 'subproject-docs=disabled', 68 | 'subproject-examples=enabled', # used in libvhost tests 69 | 'subproject-tests=disabled' 70 | ], 71 | required: get_option('libblkio') 72 | ) 73 | subdir('tests') 74 | -------------------------------------------------------------------------------- /queue.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Lists and queues. 3 | * 4 | * Relevant OSes provide BSD-originated sys/queue.h, so just use it here, with 5 | * a few extensions. 
6 | */ 7 | 8 | #pragma once 9 | 10 | #include 11 | #include "catomic.h" 12 | 13 | /* 14 | * Atomically insert a new list head 15 | */ 16 | #define SLIST_INSERT_HEAD_ATOMIC(head, elm, field) ({ \ 17 | typeof(elm) old_slh_first; \ 18 | do { \ 19 | /* Grab the current head and make the new element point to it */ \ 20 | (elm)->field.sle_next = catomic_read(&(head)->slh_first); \ 21 | old_slh_first = (elm)->field.sle_next; \ 22 | \ 23 | /* Repeat until slh_first matches old_slh_first at the time of cmpxchg */ \ 24 | } while (catomic_cmpxchg(&(head)->slh_first, old_slh_first, (elm)) != \ 25 | old_slh_first); \ 26 | old_slh_first; }) 27 | 28 | /* 29 | * Atomically move the list into 'dest' leaving 'src' empty 30 | */ 31 | #define SLIST_MOVE_ATOMIC(dest, src) do { \ 32 | (dest)->slh_first = catomic_xchg(&(src)->slh_first, NULL); \ 33 | } while (0) 34 | 35 | /* 36 | * Read the current list head with consume 37 | */ 38 | #define SLIST_FIRST_RCU(head) catomic_rcu_read(&(head)->slh_first) 39 | 40 | #define LIST_FOREACH_SAFE(elm, head, field, tmp_elm) \ 41 | for ((elm) = ((head)->lh_first); \ 42 | (elm) && ((tmp_elm) = LIST_NEXT((elm), field), 1); \ 43 | (elm) = (tmp_elm)) 44 | -------------------------------------------------------------------------------- /include/vhost/fs.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "vhost/types.h" 4 | 5 | #ifdef __cplusplus 6 | extern "C" { 7 | #endif 8 | 9 | struct vhd_io; 10 | struct vhd_request_queue; 11 | struct vhd_vdev; 12 | 13 | /** 14 | * Client-supplied file system definition. 15 | */ 16 | struct vhd_fsdev_info { 17 | /* Path to create listen sockets */ 18 | const char *socket_path; 19 | 20 | /* Device tag (file system name visible to the guest) */ 21 | const char *tag; 22 | 23 | /* Total number of backend queues this device supports */ 24 | uint32_t num_queues; 25 | }; 26 | 27 | /** 28 | * In-flight file system io request 29 | */ 30 | struct vhd_fs_io { 31 | struct vhd_sglist sglist; 32 | }; 33 | 34 | struct vhd_fs_io *vhd_get_fs_io(struct vhd_io *io); 35 | 36 | /** 37 | * Register vhost file system. 38 | * 39 | * After registering device will be accessible through vhost socket to client. 40 | * All requests are submitted to attacher request queue for caller to process. 41 | * 42 | * @fsdev Caller file system device info. 43 | * @rq Request queue to use for dispatch device I/O requests. 44 | * @priv Caller private data to associate with resulting vdev. 45 | */ 46 | struct vhd_vdev *vhd_register_fs(struct vhd_fsdev_info *fsdev, 47 | struct vhd_request_queue *rq, 48 | void *priv); 49 | 50 | struct vhd_vdev *vhd_register_fs_mq(struct vhd_fsdev_info *fsdev, 51 | struct vhd_request_queue **rqs, 52 | int num_rqs, 53 | void *priv); 54 | 55 | /** 56 | * Unregister vhost file system. 
57 | */ 58 | void vhd_unregister_fs(struct vhd_vdev *vdev, 59 | void (*unregister_complete)(void *), 60 | void *arg); 61 | 62 | #ifdef __cplusplus 63 | } 64 | #endif 65 | -------------------------------------------------------------------------------- /include/vhost/types.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Common types' definitions 3 | */ 4 | 5 | #pragma once 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #ifdef __cplusplus 13 | extern "C" { 14 | #endif 15 | 16 | struct vhd_buffer { 17 | void *base; 18 | size_t len; 19 | 20 | /* Buffer is write-only if true and read-only if false */ 21 | bool write_only; 22 | }; 23 | 24 | struct vhd_sglist { 25 | uint32_t nbuffers; 26 | struct vhd_buffer *buffers; 27 | }; 28 | 29 | /** 30 | * virtqueue usage statistics 31 | */ 32 | struct vhd_vq_metrics { 33 | /* Dispatch counters */ 34 | /* number of times vring was processed */ 35 | uint64_t dispatch_total; 36 | /* number of times vring was empty on processing */ 37 | uint64_t dispatch_empty; 38 | 39 | /* Request counters */ 40 | /* total amount of requests processed */ 41 | uint64_t request_total; 42 | /* total amount of requests completed */ 43 | uint64_t request_completed; 44 | 45 | /* Other counters*/ 46 | /* number of requests was dispatched from vring last time*/ 47 | uint16_t queue_len_last; 48 | /* max queue len was processed during 60s period */ 49 | uint16_t queue_len_max_60s; 50 | }; 51 | 52 | /** 53 | * request queue usage statistics 54 | */ 55 | struct vhd_rq_metrics { 56 | /* number of requests read from guest and put to internal queue */ 57 | uint64_t enqueued; 58 | /* number of requests dispatched for handling */ 59 | uint64_t dequeued; 60 | /* number of requests completed externally and scheduled for completion in rq */ 61 | uint64_t completions_received; 62 | /* number of requests completed and reported to guest */ 63 | uint64_t completed; 64 | /* number of requests canceled from internal queue before dispatch */ 65 | uint64_t cancelled; 66 | 67 | /* timestamp of oldest infight request */ 68 | time_t oldest_inflight_ts; 69 | }; 70 | 71 | #ifdef __cplusplus 72 | } 73 | #endif 74 | -------------------------------------------------------------------------------- /.github/workflows/main.yaml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: [push, pull_request] 3 | 4 | jobs: 5 | build-and-run-tests: 6 | runs-on: ubuntu-latest 7 | 8 | steps: 9 | - name: Checkout the repository 10 | uses: actions/checkout@v3 11 | 12 | - name: Install libraries 13 | run: | 14 | sudo apt update 15 | sudo apt install python3 python3-docutils meson clang libaio-dev rustc cargo 16 | python3 -m pip install pytest 17 | rustup default 1.89.0 18 | rustup component add clippy 19 | 20 | - name: Build 21 | run: | 22 | src_dir="${{ github.workspace }}" 23 | build_dir="$src_dir/build" 24 | CC=clang meson setup $build_dir $src_dir 25 | pushd $build_dir 26 | ninja 27 | popd 28 | 29 | - name: Run Tests 30 | run: | 31 | python3 -m pytest ${{ github.workspace }}/tests/test_libvhost.py -rsv --junitxml result.xml 32 | 33 | - name: Collect test results 34 | uses: mikepenz/action-junit-report@v3 35 | if: always() 36 | with: 37 | report_paths: result.xml 38 | build-with-cmake: 39 | runs-on: ubuntu-latest 40 | 41 | steps: 42 | - name: Checkout the repository 43 | uses: actions/checkout@v3 44 | 45 | - name: Install libraries 46 | run: | 47 | sudo apt update 48 | sudo apt install cmake ninja-build 
clang 49 | 50 | - name: Build 51 | run: | 52 | src_dir="${{ github.workspace }}" 53 | build_dir="$src_dir/build" 54 | cmake -S $src_dir -B $build_dir -G Ninja -DCMAKE_C_COMPILER=clang 55 | ninja -C $build_dir 56 | lint-python-scripts: 57 | runs-on: ubuntu-latest 58 | 59 | strategy: 60 | fail-fast: true 61 | 62 | steps: 63 | - uses: actions/checkout@v3 64 | 65 | - name: Install flake8 & mypy 66 | run: | 67 | sudo apt update 68 | sudo apt install python3 python3-pip 69 | pip install flake8 mypy pytest 70 | 71 | - name: Run flake8 72 | run: flake8 tests/*.py 73 | 74 | - name: Run mypy 75 | run: mypy --disallow-incomplete-defs --no-implicit-optional tests/*.py 76 | -------------------------------------------------------------------------------- /objref.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Generic reference counting infrastructure. 3 | * 4 | * Note: refcounts need more relaxed memory ordering that regular atomics. 5 | * 6 | * The increments provide no ordering, because it's expected that the object is 7 | * held by something else that provides ordering. 8 | * 9 | * The decrements provide release order, such that all the prior loads and 10 | * stores will be issued before, it also provides a control dependency, which 11 | * will order against the subsequent free(). 12 | * 13 | * The control dependency is against the load of the cmpxchg (ll/sc) that 14 | * succeeded. This means the stores aren't fully ordered, but this is fine 15 | * because the 1->0 transition indicates no concurrency. 16 | * 17 | * The decrements dec_and_test() and sub_and_test() also provide acquire 18 | * ordering on success. 19 | */ 20 | 21 | #pragma once 22 | 23 | #include 24 | #include "catomic.h" 25 | #include "platform.h" 26 | 27 | struct objref { 28 | unsigned long refcount; 29 | void (*release)(struct objref *objref); 30 | }; 31 | 32 | static inline void objref_init(struct objref *objref, 33 | void (*release)(struct objref *objref)) 34 | { 35 | objref->release = release; 36 | catomic_set(&objref->refcount, 1); 37 | } 38 | 39 | static inline unsigned int objref_read(struct objref *objref) 40 | { 41 | return catomic_read(&objref->refcount); 42 | } 43 | 44 | static inline void refcount_inc(unsigned long *ptr) 45 | { 46 | __atomic_fetch_add(ptr, 1, __ATOMIC_RELAXED); 47 | } 48 | 49 | static inline void objref_get(struct objref *objref) 50 | { 51 | refcount_inc(&objref->refcount); 52 | } 53 | 54 | static inline bool refcount_dec_and_test(unsigned long *ptr) 55 | { 56 | const int memory_order = 57 | #if VHD_HAS_FEATURE(thread_sanitizer) 58 | __ATOMIC_ACQ_REL; 59 | #else 60 | __ATOMIC_RELEASE; 61 | #endif 62 | unsigned long old = __atomic_fetch_sub(ptr, 1, memory_order); 63 | 64 | if (old == 1) { 65 | smp_mb_acquire(); 66 | return true; 67 | } 68 | return false; 69 | } 70 | 71 | /* 72 | * Decrement refcount for object, and call @release if it drops to zero. 73 | * Return true if the object was removed, otherwise return false. 74 | * Note: only "true" is trustworthy, "false" doesn't prevent another thread 75 | * from releasing the object. 
76 | */ 77 | static inline bool objref_put(struct objref *objref) 78 | { 79 | if (refcount_dec_and_test(&objref->refcount)) { 80 | objref->release(objref); 81 | return true; 82 | } 83 | return false; 84 | } 85 | -------------------------------------------------------------------------------- /virtio/virtio_spec.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Definitions from virtio spec version 1.0 3 | * http://docs.oasis-open.org/virtio/virtio/v1.0/virtio-v1.0.html. 4 | * 5 | * Type naming and style is preserved verbatim from virtio spec. 6 | */ 7 | 8 | #pragma once 9 | 10 | #include "platform.h" 11 | #include "virtio_types.h" 12 | 13 | #ifdef __cplusplus 14 | extern "C" { 15 | #endif 16 | 17 | #define VIRTQ_SIZE_MAX 32768u 18 | 19 | struct virtq_desc { 20 | /* Address (guest-physical). */ 21 | le64 addr; 22 | /* Length. */ 23 | le32 len; 24 | 25 | /* This marks a buffer as continuing via the next field. */ 26 | #define VIRTQ_DESC_F_NEXT 1 27 | /* This marks a buffer as device write-only (otherwise device read-only). */ 28 | #define VIRTQ_DESC_F_WRITE 2 29 | /* This means the buffer contains a list of buffer descriptors. */ 30 | #define VIRTQ_DESC_F_INDIRECT 4 31 | /* The flags as indicated above. */ 32 | le16 flags; 33 | /* Next field if flags & NEXT */ 34 | le16 next; 35 | }; 36 | VHD_STATIC_ASSERT(sizeof(struct virtq_desc) == 16); 37 | 38 | struct virtq_avail { 39 | #define VIRTQ_AVAIL_F_NO_INTERRUPT 1 40 | le16 flags; 41 | le16 idx; 42 | le16 ring[]; /* Queue Size */ 43 | /* le16 used_event; Only if VIRTIO_F_EVENT_IDX */ 44 | }; 45 | VHD_STATIC_ASSERT(sizeof(struct virtq_avail) == 4); 46 | 47 | /* le32 is used here for ids for padding reasons. */ 48 | struct virtq_used_elem { 49 | /* Index of start of used descriptor chain. */ 50 | le32 id; 51 | /* 52 | * The number of bytes written into the device writable portion of 53 | * the buffer described by the descriptor chain. 54 | */ 55 | le32 len; 56 | }; 57 | VHD_STATIC_ASSERT(sizeof(struct virtq_used_elem) == 8); 58 | 59 | struct virtq_used { 60 | #define VIRTQ_USED_F_NO_NOTIFY 1 61 | le16 flags; 62 | le16 idx; 63 | struct virtq_used_elem ring[]; /* Queue Size */ 64 | /* le16 avail_event; Only if VIRTIO_F_EVENT_IDX */ 65 | }; 66 | VHD_STATIC_ASSERT(sizeof(struct virtq_used) == 4); 67 | 68 | /* 69 | * Virtqueue layout cannot be represented by a C struct, 70 | * definition below is intentionally a comment. 71 | * struct virtq { 72 | * // The actual descriptors (16 bytes each) 73 | * struct virtq_desc desc[ Queue Size ]; 74 | * 75 | * // A ring of available descriptor heads with free-running index. 76 | * struct virtq_avail avail; 77 | * le16 used_event; // Only if VIRTIO_F_EVENT_IDX 78 | * 79 | * // Padding to the next PAGE_SIZE boundary. 80 | * u8 pad[ Padding ]; 81 | * 82 | * // A ring of used descriptor heads with free-running index. 
83 | * struct virtq_used used; 84 | * le16 avail_event; // Only if VIRTIO_F_EVENT_IDX 85 | * }; 86 | */ 87 | 88 | static inline size_t virtq_align(size_t size) 89 | { 90 | return (size + platform_page_size) & ~platform_page_size; 91 | } 92 | 93 | static inline unsigned virtq_size(unsigned int qsz) 94 | { 95 | return virtq_align(sizeof(struct virtq_desc) * qsz + 96 | sizeof(le16) * (3 + qsz)) 97 | + virtq_align(sizeof(le16) * 3 + 98 | sizeof(struct virtq_used_elem) * qsz); 99 | } 100 | 101 | #ifdef __cplusplus 102 | } 103 | #endif 104 | -------------------------------------------------------------------------------- /memlog.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "catomic.h" 5 | #include "logging.h" 6 | #include "memlog.h" 7 | #include "memmap.h" 8 | 9 | struct vhd_memory_log { 10 | unsigned long *base; 11 | size_t size; 12 | }; 13 | 14 | struct vhd_memory_log *vhd_memlog_new(size_t size, int fd, off_t offset) 15 | { 16 | struct vhd_memory_log *log; 17 | void *base; 18 | 19 | base = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, offset); 20 | if (base == MAP_FAILED) { 21 | VHD_LOG_ERROR("mmap(%zu, %d, %zu): %s", size, fd, offset, 22 | strerror(errno)); 23 | return NULL; 24 | } 25 | 26 | log = vhd_alloc(sizeof(*log)); 27 | *log = (struct vhd_memory_log) { 28 | .base = base, 29 | .size = size, 30 | }; 31 | return log; 32 | } 33 | 34 | void vhd_memlog_free(struct vhd_memory_log *log) 35 | { 36 | munmap(log->base, log->size); 37 | vhd_free(log); 38 | } 39 | 40 | static void atomic_or_le_ulong(unsigned long *ptr, unsigned long mask) 41 | { 42 | VHD_STATIC_ASSERT(sizeof(*ptr) == sizeof(uint64_t)); 43 | catomic_or(ptr, htole64(mask)); 44 | } 45 | 46 | static void bitmap_set_atomic(unsigned long *map, size_t start, size_t end) 47 | { 48 | static const unsigned bits_per_word = sizeof(*map) * 8; 49 | size_t start_idx = start / bits_per_word; 50 | size_t end_idx = end / bits_per_word; 51 | size_t i; 52 | unsigned start_in_word = start % bits_per_word; 53 | unsigned end_in_word = end % bits_per_word; 54 | 55 | /* first partial word */ 56 | if (start_in_word && start_idx < end_idx) { 57 | atomic_or_le_ulong(&map[start_idx], ~0UL << start_in_word); 58 | start_in_word = 0; 59 | start_idx++; 60 | } 61 | 62 | /* full words: no RMW so relaxed atomic; no endianness */ 63 | for (i = start_idx; i < end_idx; i++) { 64 | catomic_set(&map[i], ~0UL); 65 | } 66 | 67 | /* last partial word */ 68 | if (end_in_word) { 69 | unsigned nr_clear_bits = bits_per_word - (end_in_word - start_in_word); 70 | atomic_or_le_ulong(&map[end_idx], 71 | (~0UL >> nr_clear_bits) << start_in_word); 72 | } else if (start_idx < end_idx) { 73 | /* 74 | * if there were any relaxed catomic_set's not followed by an implicit 75 | * full memory barrier in catomic_or, do an explicit one 76 | */ 77 | smp_mb(); 78 | } 79 | } 80 | 81 | #define VHOST_LOG_PAGE 0x1000 82 | 83 | void vhd_mark_gpa_range_dirty(struct vhd_memory_log *log, uint64_t gpa, 84 | size_t len) 85 | { 86 | size_t start = gpa / VHOST_LOG_PAGE; 87 | size_t end = (gpa + len - 1) / VHOST_LOG_PAGE + 1; 88 | 89 | /* this is internal function, overflown ranges shouldn't reach here */ 90 | VHD_ASSERT(gpa + len > gpa); 91 | 92 | if (end > log->size * 8) { 93 | VHD_LOG_ERROR("range 0x%zx-0x%zx beyond log size %zx", gpa, 94 | gpa + len - 1, log->size); 95 | end = log->size * 8; 96 | } 97 | 98 | bitmap_set_atomic(log->base, start, end); 99 | } 100 | 101 | void vhd_mark_range_dirty(struct 
vhd_memory_log *log, 102 | struct vhd_memory_map *mm, void *ptr, size_t len) 103 | { 104 | uint64_t gpa = ptr_to_gpa(mm, ptr); 105 | if (gpa != TRANSLATION_FAILED) { 106 | vhd_mark_gpa_range_dirty(log, gpa, len); 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /virtio/virtio_blk.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "virtio_blk_spec.h" 4 | 5 | #ifdef __cplusplus 6 | extern "C" { 7 | #endif 8 | 9 | #define VIRTIO_BLK_DEFAULT_FEATURES ((uint64_t)( \ 10 | (1UL << VIRTIO_F_RING_INDIRECT_DESC) | \ 11 | (1UL << VIRTIO_F_RING_EVENT_IDX) | \ 12 | (1UL << VIRTIO_F_VERSION_1) | \ 13 | (1UL << VIRTIO_BLK_F_SEG_MAX) | \ 14 | (1UL << VIRTIO_BLK_F_GEOMETRY) | \ 15 | (1UL << VIRTIO_BLK_F_BLK_SIZE) | \ 16 | (1UL << VIRTIO_BLK_F_TOPOLOGY) | \ 17 | (1UL << VIRTIO_BLK_F_MQ))) 18 | 19 | /* 20 | * TODO: can implement size_max and seg_max to better control request limits 21 | * (1UL << VIRTIO_BLK_F_SIZE_MAX) | \ 22 | */ 23 | 24 | /* 25 | * Same as QEMU: 26 | * We support only one segment per request since multiple segments 27 | * are not widely used and there are no userspace APIs that allow 28 | * applications to submit multiple segments in a single call. 29 | */ 30 | #define VIRTIO_BLK_MAX_DISCARD_SEGMENTS 1 31 | #define VIRTIO_BLK_MAX_WRITE_ZEROES_SEGMENTS 1 32 | 33 | /* 34 | * The config field is an 'le32', we just set it to the maximum 35 | * possible value as we don't really have any reasons to limit 36 | * it to a lower number here. 37 | */ 38 | #define VIRTIO_BLK_MAX_DISCARD_SECTORS UINT32_MAX 39 | #define VIRTIO_BLK_MAX_WRITE_ZEROES_SECTORS UINT32_MAX 40 | 41 | struct vhd_bdev_info; 42 | struct vhd_io; 43 | 44 | struct virtio_virtq; 45 | struct virtio_blk_dev; 46 | 47 | /** 48 | * Virtio block I/O dispatch function, 49 | * can be overriden for testing. 50 | */ 51 | __attribute__((weak)) 52 | int virtio_blk_handle_request(struct virtio_virtq *vq, 53 | struct vhd_io *io); 54 | 55 | /** 56 | * Virtio block device context 57 | */ 58 | struct virtio_blk_dev { 59 | char *serial; 60 | uint64_t features; 61 | 62 | /* blk config data generated on init from bdev */ 63 | struct virtio_blk_config config; 64 | }; 65 | 66 | /** 67 | * Init virtio blk device context from bdev info 68 | */ 69 | void virtio_blk_init_dev( 70 | struct virtio_blk_dev *dev, 71 | const struct vhd_bdev_info *bdev); 72 | 73 | /** 74 | * Destroy virtio blk device context 75 | */ 76 | void virtio_blk_destroy_dev(struct virtio_blk_dev *dev); 77 | 78 | /** 79 | * Dispatch requests from device virtq 80 | */ 81 | int virtio_blk_dispatch_requests(struct virtio_blk_dev *dev, 82 | struct virtio_virtq *vq); 83 | 84 | /** 85 | * Get the virtio config 86 | */ 87 | size_t virtio_blk_get_config(struct virtio_blk_dev *dev, void *cfgbuf, 88 | size_t bufsize, size_t offset); 89 | 90 | /** 91 | * Get all supported virtio features 92 | */ 93 | uint64_t virtio_blk_get_features(struct virtio_blk_dev *dev); 94 | 95 | /** 96 | * Check if @dev supports a given virtio feature. 97 | * @feature is the bit index, and not the mask 98 | */ 99 | bool virtio_blk_has_feature(struct virtio_blk_dev *dev, int feature); 100 | 101 | /** 102 | * Get readonly status 103 | */ 104 | bool virtio_blk_is_readonly(struct virtio_blk_dev *dev); 105 | 106 | /** 107 | * Get total_blocks 108 | */ 109 | uint64_t virtio_blk_get_total_blocks(struct virtio_blk_dev *dev); 110 | 111 | /** 112 | * Update virtio config for new @total_blocks. 
113 | */ 114 | void virtio_blk_set_total_blocks(struct virtio_blk_dev *dev, 115 | uint64_t total_blocks); 116 | 117 | #ifdef __cplusplus 118 | } 119 | #endif 120 | -------------------------------------------------------------------------------- /catomic.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | /* 4 | * This is a compiler barrier, it doesn't prevent the CPU from reordering loads 5 | * and stores in any way, but prevents compiler optimizations such as 6 | * reordering code around. Mostly used internally by this header to make other 7 | * helpers fully atomic. 8 | */ 9 | #define barrier() __atomic_signal_fence(__ATOMIC_ACQ_REL) 10 | 11 | /* 12 | * Reportedly __atomic_thread_fence does not include a compiler barrier, so add 13 | * one here. 14 | */ 15 | #define smp_mb() \ 16 | ({ barrier(); __atomic_thread_fence(__ATOMIC_SEQ_CST); }) 17 | #define smp_mb_release() \ 18 | ({ barrier(); __atomic_thread_fence(__ATOMIC_RELEASE); }) 19 | #define smp_mb_acquire() \ 20 | ({ barrier(); __atomic_thread_fence(__ATOMIC_ACQUIRE); }) 21 | 22 | /* 23 | * Reportedly current compilers promote consume order to acquire and 24 | * slow this down unnecessarily. This seems not to be the case on x86_64; need 25 | * to recheck if we ever build for another arch. 26 | */ 27 | #if !defined(__x86_64__) && !defined(__aarch64__) 28 | #error Verify smp_read_barrier_depends incurs no extra costs 29 | #endif 30 | #define smp_read_barrier_depends() \ 31 | ({ barrier(); __atomic_thread_fence(__ATOMIC_CONSUME); }) 32 | 33 | #define smp_wmb() smp_mb_release() 34 | #define smp_rmb() smp_mb_acquire() 35 | 36 | #define catomic_read(ptr) __atomic_load_n(ptr, __ATOMIC_RELAXED) 37 | #define catomic_set(ptr, val) __atomic_store_n(ptr, val, __ATOMIC_RELAXED) 38 | 39 | #define catomic_load_acquire(ptr) \ 40 | __atomic_load_n(ptr, __ATOMIC_ACQUIRE) 41 | #define catomic_store_release(ptr, val) \ 42 | __atomic_store_n(ptr, val, __ATOMIC_RELEASE) 43 | 44 | /* 45 | * catomic_rcu_read potentially has the same issue with consume order as 46 | * smp_read_barrier_depends, see above. 
47 | */ 48 | #if !defined(__x86_64__) && !defined(__aarch64__) 49 | #error Verify catomic_rcu_read incurs no extra costs 50 | #endif 51 | #define catomic_rcu_read(ptr) __atomic_load_n(ptr, __ATOMIC_CONSUME) 52 | #define catomic_rcu_set(ptr, val) __atomic_store_n(ptr, val, __ATOMIC_RELEASE) 53 | 54 | #define catomic_xchg(ptr, val) \ 55 | __atomic_exchange_n(ptr, val, __ATOMIC_SEQ_CST) 56 | #define catomic_cmpxchg(ptr, old, new) ({ \ 57 | __auto_type _old = (old); \ 58 | (void) __atomic_compare_exchange_n(ptr, &_old, new, false, \ 59 | __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); \ 60 | _old; }) 61 | 62 | #define catomic_fetch_add(ptr, n) __atomic_fetch_add(ptr, n, __ATOMIC_SEQ_CST) 63 | #define catomic_fetch_sub(ptr, n) __atomic_fetch_sub(ptr, n, __ATOMIC_SEQ_CST) 64 | #define catomic_fetch_and(ptr, n) __atomic_fetch_and(ptr, n, __ATOMIC_SEQ_CST) 65 | #define catomic_fetch_or(ptr, n) __atomic_fetch_or(ptr, n, __ATOMIC_SEQ_CST) 66 | #define catomic_fetch_xor(ptr, n) __atomic_fetch_xor(ptr, n, __ATOMIC_SEQ_CST) 67 | 68 | #define catomic_fetch_inc(ptr) catomic_fetch_add(ptr, 1) 69 | #define catomic_fetch_dec(ptr) catomic_fetch_sub(ptr, 1) 70 | 71 | #define catomic_add(ptr, n) ((void) catomic_fetch_add(ptr, n)) 72 | #define catomic_sub(ptr, n) ((void) catomic_fetch_sub(ptr, n)) 73 | #define catomic_and(ptr, n) ((void) catomic_fetch_and(ptr, n)) 74 | #define catomic_or(ptr, n) ((void) catomic_fetch_or(ptr, n)) 75 | #define catomic_xor(ptr, n) ((void) catomic_fetch_xor(ptr, n)) 76 | #define catomic_inc(ptr) ((void) catomic_fetch_inc(ptr)) 77 | #define catomic_dec(ptr) ((void) catomic_fetch_dec(ptr)) 78 | -------------------------------------------------------------------------------- /fs.c: -------------------------------------------------------------------------------- 1 | #include "vhost/fs.h" 2 | #include "virtio/virtio_fs.h" 3 | 4 | #include "bio.h" 5 | #include "logging.h" 6 | #include "server_internal.h" 7 | #include "vdev.h" 8 | 9 | /******************************************************************************/ 10 | 11 | struct vhd_fsdev { 12 | /* Base vdev */ 13 | struct vhd_vdev vdev; 14 | 15 | /* VM-facing interface type */ 16 | struct virtio_fs_dev vfs; 17 | 18 | LIST_ENTRY(vhd_fsdev) fsdevs; 19 | }; 20 | 21 | static LIST_HEAD(, vhd_fsdev) g_fsdev_list = LIST_HEAD_INITIALIZER(g_fsdev_list); 22 | 23 | #define VHD_FSDEV_FROM_VDEV(ptr) containerof(ptr, struct vhd_fsdev, vdev) 24 | 25 | /******************************************************************************/ 26 | 27 | static uint64_t vfs_get_features(struct vhd_vdev *vdev) 28 | { 29 | return VIRTIO_FS_DEFAULT_FEATURES; 30 | } 31 | 32 | static int vfs_set_features(struct vhd_vdev *vdev, uint64_t features) 33 | { 34 | return 0; 35 | } 36 | 37 | static size_t vfs_get_config(struct vhd_vdev *vdev, void *cfgbuf, 38 | size_t bufsize, size_t offset) 39 | { 40 | struct vhd_fsdev *dev = VHD_FSDEV_FROM_VDEV(vdev); 41 | 42 | if (offset >= sizeof(dev->vfs.config)) { 43 | return 0; 44 | } 45 | 46 | size_t data_size = MIN(bufsize, sizeof(dev->vfs.config) - offset); 47 | 48 | memcpy(cfgbuf, (char *)(&dev->vfs.config) + offset, data_size); 49 | 50 | return data_size; 51 | } 52 | 53 | static int vfs_dispatch_requests(struct vhd_vdev *vdev, 54 | struct vhd_vring *vring) 55 | { 56 | struct vhd_fsdev *dev = VHD_FSDEV_FROM_VDEV(vdev); 57 | return virtio_fs_dispatch_requests(&dev->vfs, &vring->vq); 58 | } 59 | 60 | static void vfs_free(struct vhd_vdev *vdev) 61 | { 62 | struct vhd_fsdev *dev = VHD_FSDEV_FROM_VDEV(vdev); 63 | 64 | LIST_REMOVE(dev, fsdevs); 65 | 
vhd_free(dev); 66 | } 67 | 68 | static const struct vhd_vdev_type g_virtio_fs_vdev_type = { 69 | .desc = "virtio-fs", 70 | .get_features = vfs_get_features, 71 | .set_features = vfs_set_features, 72 | .get_config = vfs_get_config, 73 | .dispatch_requests = vfs_dispatch_requests, 74 | .free = vfs_free, 75 | }; 76 | 77 | /******************************************************************************/ 78 | 79 | struct vhd_vdev *vhd_register_fs(struct vhd_fsdev_info *fsdev, 80 | struct vhd_request_queue *rq, 81 | void *priv) 82 | { 83 | return vhd_register_fs_mq(fsdev, &rq, 1, priv); 84 | } 85 | 86 | struct vhd_vdev *vhd_register_fs_mq(struct vhd_fsdev_info *fsdev, 87 | struct vhd_request_queue **rqs, 88 | int num_rqs, 89 | void *priv) 90 | { 91 | VHD_VERIFY(fsdev); 92 | VHD_VERIFY(rqs); 93 | 94 | struct vhd_fsdev *dev = vhd_zalloc(sizeof(*dev)); 95 | 96 | int res = virtio_fs_init_dev(&dev->vfs, fsdev); 97 | if (res != 0) { 98 | goto error_out; 99 | } 100 | 101 | res = vhd_vdev_init_server(&dev->vdev, fsdev->socket_path, &g_virtio_fs_vdev_type, 102 | fsdev->num_queues, rqs, num_rqs, priv, NULL, NULL, 0); 103 | if (res != 0) { 104 | goto error_out; 105 | } 106 | 107 | LIST_INSERT_HEAD(&g_fsdev_list, dev, fsdevs); 108 | return &dev->vdev; 109 | 110 | error_out: 111 | vhd_free(dev); 112 | return NULL; 113 | } 114 | 115 | void vhd_unregister_fs(struct vhd_vdev *vdev, 116 | void (*unregister_complete)(void *), 117 | void *arg) 118 | { 119 | vhd_vdev_stop_server(vdev, unregister_complete, arg); 120 | } 121 | -------------------------------------------------------------------------------- /include/vhost/server.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include "vhost/types.h" 6 | 7 | #ifdef __cplusplus 8 | extern "C" { 9 | #endif 10 | 11 | #define VHD_MAX_REQUEST_QUEUES 256 12 | 13 | struct vhd_vdev; 14 | struct vhd_io; 15 | 16 | /** 17 | * Logging support 18 | */ 19 | 20 | enum LogLevel { 21 | LOG_ERROR = 0, 22 | LOG_WARNING = 1, 23 | LOG_INFO = 2, 24 | LOG_DEBUG = 3 25 | }; 26 | 27 | typedef void (*log_function)(enum LogLevel level, const char *format, ...); 28 | 29 | /** 30 | * Start vhost server 31 | * 32 | * Server will spawn one native thread to wait for incoming vhost handshakes. 33 | * This thread will only handle global vhost protocol communication. 34 | * Device I/O events are handled separately by plugging into request queues. 35 | * 36 | * Return 0 on success or negative error code. 37 | */ 38 | int vhd_start_vhost_server(log_function log_fn); 39 | 40 | /** 41 | * Stop vhost server 42 | * 43 | * Stop vhost event thread which means no new vhost connections are possible 44 | */ 45 | void vhd_stop_vhost_server(void); 46 | 47 | /** 48 | * Request instance stored in request queue 49 | */ 50 | struct vhd_request { 51 | /* Device that generated this request */ 52 | struct vhd_vdev *vdev; 53 | 54 | /* Device type-specific request data */ 55 | struct vhd_io *io; 56 | }; 57 | 58 | /** 59 | * Server request queue 60 | * 61 | * Request queues are created by client and attached to vhost device(s). 62 | * Each device will then send its events to its attched queue. 63 | * This way request queues serve as a unit of load balancing. 64 | */ 65 | struct vhd_request_queue; 66 | 67 | /** 68 | * Create new request queue 69 | */ 70 | struct vhd_request_queue *vhd_create_request_queue(void); 71 | 72 | /** 73 | * Destroy request queue. 74 | * Don't call this until there are devices attached to this queue. 
75 | */ 76 | void vhd_release_request_queue(struct vhd_request_queue *rq); 77 | 78 | /** 79 | * Run queue in calling thread. 80 | * Will block until any of the devices enqueue requests. 81 | * Returns: 82 | * 0 - when the request queue shouldn't be running any more 83 | * -EAGAIN - when the request should be running further 84 | * <0 - on other errors 85 | */ 86 | int vhd_run_queue(struct vhd_request_queue *rq); 87 | 88 | /** 89 | * Unblock running request queue. 90 | * After calling this vhd_run_queue will eventually return and can the be 91 | * reeintered. 92 | */ 93 | void vhd_stop_queue(struct vhd_request_queue *rq); 94 | 95 | /** 96 | * Dequeue next request. 97 | */ 98 | bool vhd_dequeue_request(struct vhd_request_queue *rq, 99 | struct vhd_request *out_req); 100 | 101 | /** 102 | * Get request queue metrics. 103 | */ 104 | void vhd_get_rq_stat(struct vhd_request_queue *rq, 105 | struct vhd_rq_metrics *metrics); 106 | 107 | /** 108 | * Block io request result 109 | */ 110 | enum vhd_bdev_io_result { 111 | VHD_BDEV_SUCCESS = 0, 112 | VHD_BDEV_IOERR, 113 | VHD_BDEV_CANCELED, 114 | }; 115 | 116 | /* 117 | * Complete the processing of the request. The backend calls this to indicate 118 | * that it's done with the request and the library may signal completion to the 119 | * guest driver and dispose of the request. 120 | */ 121 | void vhd_complete_bio(struct vhd_io *io, enum vhd_bdev_io_result status); 122 | 123 | /** 124 | * Get private data associated with vdev. 125 | */ 126 | void *vhd_vdev_get_priv(struct vhd_vdev *vdev); 127 | 128 | /** 129 | * Get statistics for device's virtio queue. 130 | */ 131 | int vhd_vdev_get_queue_stat(struct vhd_vdev *vdev, uint32_t queue_num, 132 | struct vhd_vq_metrics *metrics); 133 | 134 | #ifdef __cplusplus 135 | } 136 | #endif 137 | -------------------------------------------------------------------------------- /event.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #ifdef __cplusplus 6 | extern "C" { 7 | #endif 8 | 9 | #define VHD_EVENT_LOOP_DEFAULT_MAX_EVENTS 32 10 | 11 | /** 12 | * Event loop instance 13 | * 14 | * Each event loop will run in a thread which calls vhd_run_event_loop. 15 | * Events detected in given event loop iteration will also be handled in this 16 | * thread. 17 | * 18 | * Event loop management operations (add/remove events) are thread-safe, 19 | * although changes to list of events may not be visible until next 20 | * vhd_run_event_loop. 21 | */ 22 | struct vhd_event_loop; 23 | 24 | /** 25 | * Create new event loop. 26 | * @max_events How many events we can handle in one iteration. 27 | * Events are reported in FIFO order to avoid starvation. 28 | */ 29 | struct vhd_event_loop *vhd_create_event_loop(size_t max_events); 30 | 31 | /** 32 | * Free event loop. 33 | */ 34 | void vhd_free_event_loop(struct vhd_event_loop *evloop); 35 | 36 | /** 37 | * Run a single iteration of the event loop 38 | * 39 | * @timeout 0 to return immediately, -1 to block indefinitely, milliseconds 40 | * value otherwise. 
41 | * 42 | * @return 0 if the event loop is terminated upon request 43 | * -EAGAIN if the event loop should keep going 44 | * another negative code on error 45 | */ 46 | int vhd_run_event_loop(struct vhd_event_loop *evloop, int timeout_ms); 47 | 48 | /** 49 | * Request event loop termination 50 | */ 51 | void vhd_terminate_event_loop(struct vhd_event_loop *evloop); 52 | 53 | /* I/O handling to be associated with a file descriptor */ 54 | struct vhd_io_handler; 55 | 56 | /* 57 | * Add io handler @read for @fd and attach it to @evloop. 58 | * For safe data access must be called in @evloop only. 59 | */ 60 | struct vhd_io_handler *vhd_add_io_handler(struct vhd_event_loop *evloop, 61 | int fd, int (*read)(void *), 62 | void *opaque); 63 | 64 | /* 65 | * Stop monitoring io handler @handler's file descriptor and calling its 66 | * handler functions. 67 | * For safe data access must be called in @handler's event loop only. 68 | */ 69 | 70 | int vhd_detach_io_handler(struct vhd_io_handler *handler); 71 | /* 72 | * Resume monitoring io handler @handler's file descriptor and calling its 73 | * handler functions. 74 | * For safe data access must be called in @handler's event loop only. 75 | */ 76 | int vhd_attach_io_handler(struct vhd_io_handler *handler); 77 | 78 | /* 79 | * Detach io handler @handler from its event loop and delete it. 80 | * For safe data access must be called in @handler's event loop only. 81 | */ 82 | int vhd_del_io_handler(struct vhd_io_handler *handler); 83 | 84 | /** 85 | * Clear eventfd after handling it 86 | */ 87 | void vhd_clear_eventfd(int fd); 88 | 89 | /** 90 | * Trigger eventfd 91 | */ 92 | void vhd_set_eventfd(int fd); 93 | 94 | struct vhd_bh; 95 | typedef void vhd_bh_cb(void *opaque); 96 | 97 | struct vhd_bh *vhd_bh_new(struct vhd_event_loop *ctx, 98 | vhd_bh_cb *cb, void *opaque); 99 | void vhd_bh_schedule_oneshot(struct vhd_event_loop *ctx, 100 | vhd_bh_cb *cb, void *opaque); 101 | void vhd_bh_schedule(struct vhd_bh *bh); 102 | void vhd_bh_cancel(struct vhd_bh *bh); 103 | void vhd_bh_delete(struct vhd_bh *bh); 104 | 105 | /* 106 | * Submit a work item onto @evloop and wait till it's finished. 107 | * Must not be called in the target event loop. 108 | * 109 | * Returns exactly the value which user sets by vhd_complete_work(), no other 110 | * errors possible. 
111 | */ 112 | struct vhd_work; 113 | int vhd_submit_work_and_wait(struct vhd_event_loop *evloop, 114 | void (*func)(struct vhd_work *, void *), 115 | void *opaque); 116 | /* 117 | * Signal work completion to the submitter 118 | */ 119 | void vhd_complete_work(struct vhd_work *work, int ret); 120 | 121 | #ifdef __cplusplus 122 | } 123 | #endif 124 | -------------------------------------------------------------------------------- /virtio/virt_queue.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "vhost/types.h" 6 | #include "vhost_spec.h" 7 | 8 | #include "virtio_spec.h" 9 | 10 | #ifdef __cplusplus 11 | extern "C" { 12 | #endif 13 | 14 | /** 15 | * Describes parsed buffer chain to be handled by virtio device type 16 | */ 17 | struct virtio_iov { 18 | uint16_t niov_out; 19 | uint16_t niov_in; 20 | struct vhd_buffer *iov_out; 21 | struct vhd_buffer *iov_in; 22 | struct vhd_buffer buffers[/* niov_out + niov_in */]; 23 | }; 24 | 25 | struct vhd_memory_map; 26 | struct vhd_memory_log; 27 | 28 | struct virtio_virtq { 29 | const char *log_tag; 30 | 31 | uint32_t flags; 32 | struct virtq_desc *desc; 33 | struct virtq_avail *avail; 34 | struct virtq_used *used; 35 | uint64_t used_gpa_base; 36 | 37 | /* Size of queue in number of descriptors it can hold */ 38 | uint16_t qsz; 39 | 40 | /* Max chain length (for bug compatibility with non-compliant drivers) */ 41 | uint16_t max_chain_len; 42 | 43 | /* Shadow avail ring index */ 44 | uint16_t last_avail; 45 | 46 | /* 47 | * 2.4.5.3.1: A driver MUST NOT create a descriptor chain longer than 48 | * the Queue Size of the device 49 | * Thus we can preallocate a scratch area of a known size to accumulate 50 | * scatter-gather segments before handing them over to the device. 51 | */ 52 | uint16_t niov_out; 53 | uint16_t niov_in; 54 | struct vhd_buffer *buffers; 55 | 56 | /* 57 | * Virtqueue is broken, probably because there is an invalid descriptor 58 | * chain in it. 59 | * Broken status is sticky and so far cannot be repared. 60 | */ 61 | bool broken; 62 | 63 | 64 | /* 65 | * If set, VIRTIO_F_RING_EVENT_IDX is negotiated for this queue and 66 | * avail/used_event fields must be used for notification. 67 | */ 68 | bool has_event_idx; 69 | 70 | /* 71 | * eventfd for used buffers notification. 72 | * can be reset after virtq is started. 73 | */ 74 | int notify_fd; 75 | 76 | /* 77 | * Whether the processing of this virtq is enabled. 78 | * Can be toggled after virtq is started. 79 | */ 80 | bool enabled; 81 | 82 | /* inflight information */ 83 | uint64_t req_cnt; 84 | struct inflight_split_region *inflight_region; 85 | bool inflight_check; 86 | 87 | /* 88 | * these objects are per-device but storing a link on virtqueue facilitates 89 | * bookkeeping 90 | */ 91 | struct vhd_memory_map *mm; 92 | struct vhd_memory_log *log; 93 | 94 | /* Usage statistics */ 95 | struct vq_stat { 96 | /* Metrics provided to users */ 97 | struct vhd_vq_metrics metrics; 98 | 99 | /* Metrics service info fields. 
Not provided to users */ 100 | /* timestamps for periodic metrics */ 101 | time_t period_start_ts; 102 | } stat; 103 | }; 104 | 105 | void virtio_virtq_init(struct virtio_virtq *vq); 106 | 107 | void virtio_virtq_release(struct virtio_virtq *vq); 108 | 109 | bool virtq_is_broken(struct virtio_virtq *vq); 110 | 111 | void mark_broken(struct virtio_virtq *vq); 112 | 113 | typedef void(*virtq_handle_buffers_cb)(void *arg, 114 | struct virtio_virtq *vq, 115 | struct virtio_iov *iov); 116 | int virtq_dequeue_many(struct virtio_virtq *vq, 117 | virtq_handle_buffers_cb handle_buffers_cb, 118 | void *arg); 119 | 120 | void virtq_push(struct virtio_virtq *vq, struct virtio_iov *iov, uint32_t len); 121 | 122 | void virtq_set_notify_fd(struct virtio_virtq *vq, int fd); 123 | 124 | void virtio_free_iov(struct virtio_iov *iov); 125 | uint16_t virtio_iov_get_head(struct virtio_iov *iov); 126 | 127 | void virtio_virtq_get_stat(struct virtio_virtq *vq, 128 | struct vhd_vq_metrics *metrics); 129 | 130 | void abort_request(struct virtio_virtq *vq, struct virtio_iov *iov); 131 | #ifdef __cplusplus 132 | } 133 | #endif 134 | -------------------------------------------------------------------------------- /virtio/virtio_fs.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "vhost/fs.h" 4 | 5 | #include "virtio_fs.h" 6 | #include "virtio_fs_spec.h" 7 | 8 | #include "bio.h" 9 | #include "virt_queue.h" 10 | #include "logging.h" 11 | #include "server_internal.h" 12 | #include "vdev.h" 13 | 14 | /******************************************************************************/ 15 | 16 | struct virtio_fs_io { 17 | struct virtio_virtq *vq; 18 | struct virtio_iov *iov; 19 | 20 | struct vhd_io io; 21 | struct vhd_fs_io fs_io; 22 | }; 23 | 24 | /******************************************************************************/ 25 | 26 | static void complete_request(struct vhd_io *io) 27 | { 28 | struct virtio_fs_io *vbio = containerof(io, struct virtio_fs_io, io); 29 | struct virtio_iov *viov = vbio->iov; 30 | /* if IN iov has at least one buffer it accommodates fuse_out_header */ 31 | struct virtio_fs_out_header *out = 32 | viov->niov_in ? viov->iov_in[0].base : NULL; 33 | uint32_t len = out ?
out->len : 0; 34 | 35 | if (likely(io->status != VHD_BDEV_CANCELED)) { 36 | virtq_push(vbio->vq, vbio->iov, len); 37 | } 38 | 39 | virtio_free_iov(viov); 40 | vhd_free(vbio); 41 | } 42 | 43 | static int virtio_fs_handle_request(struct virtio_virtq *vq, 44 | struct vhd_io *io) 45 | { 46 | io->vring = VHD_VRING_FROM_VQ(vq); 47 | return vhd_enqueue_request(vhd_get_rq_for_vring(io->vring), io); 48 | } 49 | 50 | static void handle_buffers(void *arg, struct virtio_virtq *vq, struct virtio_iov *iov) 51 | { 52 | uint16_t niov = iov->niov_in + iov->niov_out; 53 | (void)arg; 54 | 55 | /* 56 | * Assume legacy message framing without VIRTIO_F_ANY_LAYOUT: 57 | * - virtio IN / FUSE OUT segments, with the first one fully containing 58 | * fuse_in_header 59 | * - virtio OUT / FUSE IN segments, with the first one fully containing 60 | * fuse_out_header (except FUSE_FORGET and FUSE_BATCH_FORGET which have 61 | * no response part at all) 62 | */ 63 | 64 | struct virtio_fs_in_header *in; 65 | struct virtio_fs_out_header *out; 66 | 67 | if (iov->niov_in && iov->iov_in[0].len < sizeof(*out)) { 68 | VHD_LOG_ERROR("No room for response in the request"); 69 | abort_request(vq, iov); 70 | return; 71 | } 72 | 73 | if (!iov->niov_out || iov->iov_out[0].len < sizeof(*in)) { 74 | VHD_LOG_ERROR("Malformed request header"); 75 | abort_request(vq, iov); 76 | return; 77 | } 78 | 79 | in = iov->iov_out[0].base; 80 | out = iov->niov_in ? iov->iov_in[0].base : NULL; 81 | 82 | struct virtio_fs_io *bio = vhd_zalloc(sizeof(*bio)); 83 | bio->vq = vq; 84 | bio->iov = iov; 85 | bio->io.completion_handler = complete_request; 86 | 87 | bio->fs_io.sglist.nbuffers = niov; 88 | bio->fs_io.sglist.buffers = iov->buffers; 89 | 90 | int res = virtio_fs_handle_request(bio->vq, &bio->io); 91 | if (res != 0) { 92 | VHD_LOG_ERROR("request submission failed with %d", res); 93 | 94 | if (out) { 95 | out->len = sizeof(*out); 96 | out->error = res; 97 | out->unique = in->unique; 98 | } 99 | 100 | complete_request(&bio->io); 101 | return; 102 | } 103 | } 104 | 105 | /******************************************************************************/ 106 | 107 | int virtio_fs_init_dev( 108 | struct virtio_fs_dev *dev, 109 | struct vhd_fsdev_info *fsdev) 110 | { 111 | VHD_VERIFY(dev); 112 | VHD_VERIFY(fsdev); 113 | 114 | dev->fsdev = fsdev; 115 | 116 | dev->config = (struct virtio_fs_config) { 117 | .num_request_queues = fsdev->num_queues, 118 | }; 119 | if (fsdev->tag) { 120 | memcpy(dev->config.tag, fsdev->tag, 121 | MIN(strlen(fsdev->tag), sizeof(dev->config.tag))); 122 | } 123 | 124 | return 0; 125 | } 126 | 127 | int virtio_fs_dispatch_requests(struct virtio_fs_dev *dev, 128 | struct virtio_virtq *vq) 129 | { 130 | VHD_VERIFY(dev); 131 | VHD_VERIFY(vq); 132 | 133 | return virtq_dequeue_many(vq, handle_buffers, dev); 134 | } 135 | 136 | struct vhd_fs_io *vhd_get_fs_io(struct vhd_io *io) 137 | { 138 | struct virtio_fs_io *bio = containerof(io, struct virtio_fs_io, io); 139 | return &bio->fs_io; 140 | } 141 | -------------------------------------------------------------------------------- /virtio/virtio_blk_spec.h: -------------------------------------------------------------------------------- 1 | /* 2 | * virtio blk protocol definitions according to virtio 1.0 spec 3 | */ 4 | 5 | #pragma once 6 | 7 | #include "platform.h" 8 | #include "virtio_types.h" 9 | 10 | #ifdef __cplusplus 11 | extern "C" { 12 | #endif 13 | 14 | #define VIRTIO_BLK_SECTOR_SIZE 512 15 | #define VIRTIO_BLK_SECTOR_SHIFT 9 16 | #define VIRTIO_BLK_DISKID_LENGTH 20 17 | #define 
VIRTIO_BLK_STATUS_LENGTH 1 18 | 19 | /* Feature bits */ 20 | #define VIRTIO_BLK_F_SIZE_MAX 1 /* Maximum size of any single segment is in size_max. */ 21 | #define VIRTIO_BLK_F_SEG_MAX 2 /* Maximum number of segments in a request is in seg_max. */ 22 | #define VIRTIO_BLK_F_GEOMETRY 4 /* Disk-style geometry specified in geometry. */ 23 | #define VIRTIO_BLK_F_RO 5 /* Device is read-only. */ 24 | #define VIRTIO_BLK_F_BLK_SIZE 6 /* Block size of disk is in blk_size. */ 25 | #define VIRTIO_BLK_F_FLUSH 9 /* Cache flush command support. */ 26 | #define VIRTIO_BLK_F_TOPOLOGY 10 /* Device exports information on optimal I/O alignment. */ 27 | #define VIRTIO_BLK_F_CONFIG_WCE 11 /* Device can toggle its cache between writeback and writethrough modes. */ 28 | #define VIRTIO_BLK_F_DISCARD 13 /* Device can support discard command */ 29 | #define VIRTIO_BLK_F_WRITE_ZEROES 14 /* Device supports write-zeroes requests */ 30 | 31 | /* Custom extentions */ 32 | #define VIRTIO_BLK_F_MQ 12 /* Device reports maximum supported queues in numqueues config field */ 33 | 34 | /* Legacy interface: feature bits */ 35 | #define VIRTIO_BLK_F_BARRIER 0 /* Device supports request barriers. */ 36 | #define VIRTIO_BLK_F_SCSI 7 /* Device supports scsi packet commands. */ 37 | 38 | /* 39 | * Device configuration layout. 40 | * The capacity of the device (expressed in 512-byte sectors) is always present. 41 | * The availability of the others all depend on various feature bits as 42 | * indicated above. 43 | */ 44 | struct VHD_PACKED virtio_blk_config { 45 | le64 capacity; 46 | le32 size_max; 47 | le32 seg_max; 48 | struct VHD_PACKED virtio_blk_geometry { 49 | le16 cylinders; 50 | u8 heads; 51 | u8 sectors; 52 | } geometry; 53 | le32 blk_size; 54 | struct VHD_PACKED virtio_blk_topology { 55 | /* # of logical blocks per physical block (log2) */ 56 | u8 physical_block_exp; 57 | /* offset of first aligned logical block */ 58 | u8 alignment_offset; 59 | /* suggested minimum I/O size in blocks */ 60 | le16 min_io_size; 61 | /* optimal (suggested maximum) I/O size in blocks */ 62 | le32 opt_io_size; 63 | } topology; 64 | u8 writeback; 65 | u8 _reserved; 66 | le16 numqueues; 67 | 68 | /* VIRTIO_BLK_F_DISCARD-specific fields */ 69 | le32 max_discard_sectors; 70 | le32 max_discard_seg; 71 | le32 discard_sector_alignment; 72 | 73 | /* VIRTIO_BLK_F_WRITE_ZEROES-specific fields */ 74 | le32 max_write_zeroes_sectors; 75 | le32 max_write_zeroes_seg; 76 | u8 write_zeroes_may_unmap; 77 | u8 _reserved1[3]; 78 | }; 79 | 80 | /* 81 | * Device Operation 82 | * The driver queues requests to the virtqueue, and they are used by the device 83 | * (not necessarily in order). 
84 | * 85 | * Request is a variable sized structure: 86 | * struct virtio_blk_req { 87 | * le32 type; 88 | * le32 reserved; 89 | * le64 sector; 90 | * u8 data[][512]; 91 | * u8 status; 92 | * }; 93 | */ 94 | struct virtio_blk_req_hdr { 95 | #define VIRTIO_BLK_T_IN 0 /* Device read */ 96 | #define VIRTIO_BLK_T_OUT 1 /* Device write */ 97 | #define VIRTIO_BLK_T_FLUSH 4 /* Flush */ 98 | #define VIRTIO_BLK_T_GET_ID 8 /* Get device id */ 99 | #define VIRTIO_BLK_T_DISCARD 11 /* Discard */ 100 | #define VIRTIO_BLK_T_WRITE_ZEROES 13 /* Write zeroes */ 101 | le32 type; 102 | le32 reserved; 103 | le64 sector; 104 | }; 105 | 106 | struct virtio_blk_discard_write_zeroes { 107 | le64 sector; 108 | le32 num_sectors; 109 | struct { 110 | le32 unmap:1; 111 | le32 reserved:31; 112 | } flags; 113 | }; 114 | 115 | VHD_STATIC_ASSERT(sizeof(struct virtio_blk_req_hdr) == 16); 116 | VHD_STATIC_ASSERT(sizeof(struct virtio_blk_discard_write_zeroes) == 16); 117 | 118 | #define VIRTIO_BLK_S_OK 0 119 | #define VIRTIO_BLK_S_IOERR 1 120 | #define VIRTIO_BLK_S_UNSUPP 2 121 | 122 | #ifdef __cplusplus 123 | } 124 | #endif 125 | -------------------------------------------------------------------------------- /include/vhost/blockdev.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include "vhost/types.h" 6 | 7 | #ifdef __cplusplus 8 | extern "C" { 9 | #endif 10 | 11 | struct vhd_io; 12 | struct vhd_request_queue; 13 | struct vhd_vdev; 14 | 15 | /* 16 | * vhd_bdev_io values are always expressed in these units for any 17 | * vhd_bdev_info->sector_size configuration. 18 | */ 19 | #define VHD_SECTOR_SHIFT (9) 20 | #define VHD_SECTOR_SIZE (1ull << VHD_SECTOR_SHIFT) 21 | 22 | #define VHD_BDEV_F_READONLY (1ull << 0) 23 | #define VHD_BDEV_F_DISCARD (1ull << 1) 24 | #define VHD_BDEV_F_WRITE_ZEROES (1ull << 2) 25 | 26 | /** 27 | * Client-supplied block device backend definition 28 | */ 29 | struct vhd_bdev_info { 30 | /* Blockdev serial */ 31 | const char *serial; 32 | 33 | /* Path to create listen sockets */ 34 | const char *socket_path; 35 | 36 | /* 37 | * Physical block size in bytes, must be a multiple of sector_size 38 | * or of VHD_SECTOR_SIZE if sector_size is 0. 39 | */ 40 | uint32_t block_size; 41 | 42 | /* 43 | * Logical sector size in bytes, VHD_SECTOR_SIZE is used if 44 | * this value is set to 0. 45 | * 46 | * Note that the virtio specification technically provides this value as 47 | * a suggestion to the guest. Thus, a 4096-byte sector size disk may still 48 | * generate 512-byte requests. Technically all existing software treats 49 | * this value as a logical sector size, but care must still be taken. 50 | */ 51 | uint32_t sector_size; 52 | 53 | /* Optimal io size in bytes */ 54 | uint32_t optimal_io_size; 55 | 56 | /* Total number of backend queues this device supports */ 57 | uint32_t num_queues; 58 | 59 | /* Device size in blocks */ 60 | uint64_t total_blocks; 61 | 62 | /* Supported VHD_BDEV_F_* features */ 63 | uint64_t features; 64 | 65 | /* Gets called after mapping guest memory region */ 66 | int (*map_cb)(void *addr, size_t len); 67 | 68 | /* Gets called before unmapping guest memory region */ 69 | int (*unmap_cb)(void *addr, size_t len); 70 | 71 | /* 72 | * If set to a non-zero value, PTEs backing the guest memory regions 73 | * for this blockdev are flushed (unmapped and mapped back) every 74 | * N bytes processed by the backend. E.g. if this value is 1024, PTEs 75 | * will be flushed after the guest reads/writes 2 blocks. 
76 | */ 77 | size_t pte_flush_byte_threshold; 78 | }; 79 | 80 | static inline bool vhd_blockdev_is_readonly(const struct vhd_bdev_info *bdev) 81 | { 82 | return bdev->features & VHD_BDEV_F_READONLY; 83 | } 84 | 85 | static inline bool vhd_blockdev_has_discard(const struct vhd_bdev_info *bdev) 86 | { 87 | return bdev->features & VHD_BDEV_F_DISCARD; 88 | } 89 | 90 | static inline bool vhd_blockdev_has_write_zeroes( 91 | const struct vhd_bdev_info *bdev) 92 | { 93 | return bdev->features & VHD_BDEV_F_WRITE_ZEROES; 94 | } 95 | 96 | static inline uint32_t vhd_blockdev_sector_size( 97 | const struct vhd_bdev_info *bdev) 98 | { 99 | return bdev->sector_size ? bdev->sector_size : VHD_SECTOR_SIZE; 100 | } 101 | 102 | /** 103 | * Block io request type 104 | */ 105 | enum vhd_bdev_io_type { 106 | VHD_BDEV_READ, 107 | VHD_BDEV_WRITE, 108 | VHD_BDEV_DISCARD, 109 | VHD_BDEV_WRITE_ZEROES, 110 | }; 111 | 112 | /** 113 | * In-flight blockdev io request 114 | */ 115 | struct vhd_bdev_io { 116 | enum vhd_bdev_io_type type; 117 | 118 | /* 119 | * These values are ALWAYS expressed in VHD_SECTOR_SIZE (aka 512-byte) 120 | * units, even if this device has a larger sector_size. 121 | */ 122 | uint64_t first_sector; 123 | uint64_t total_sectors; 124 | 125 | struct vhd_sglist sglist; 126 | }; 127 | 128 | struct vhd_bdev_io *vhd_get_bdev_io(struct vhd_io *io); 129 | 130 | /** 131 | * Register a vhost block device. 132 | * 133 | * After registering a device, it will be accessible to clients through a vhost 134 | * socket. 135 | * All requests are submitted to attacher request queues for caller to process. 136 | * 137 | * @bdev Caller block device info. The structure is used only for 138 | * initialization and may be freed by caller after 139 | * vhd_register_blockdev() returns. 140 | * @rqs An array of request queues to use for dispatching device I/O 141 | * requests. 142 | * @num_rqs Number of request queues in the @rqs array. 143 | * @priv Caller private data to associate with resulting vdev. 144 | */ 145 | struct vhd_vdev *vhd_register_blockdev(const struct vhd_bdev_info *bdev, 146 | struct vhd_request_queue **rqs, 147 | int num_rqs, void *priv); 148 | 149 | /** 150 | * Unregister a vhost block device. 151 | */ 152 | void vhd_unregister_blockdev(struct vhd_vdev *vdev, 153 | void (*unregister_complete)(void *), void *arg); 154 | 155 | /** 156 | * Resize a vhost block device. 157 | * 158 | * The function change virtio config, that client may read by 159 | * VHOST_USER_GET_CONFIG command. 160 | * 161 | * Note, that client is not notified about config change, the caller is 162 | * responsible for this. 
163 | */ 164 | void vhd_blockdev_set_total_blocks(struct vhd_vdev *vdev, 165 | uint64_t total_blocks); 166 | 167 | #ifdef __cplusplus 168 | } 169 | #endif 170 | -------------------------------------------------------------------------------- /blockdev.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "vhost/blockdev.h" 5 | #include "server_internal.h" 6 | #include "vdev.h" 7 | #include "logging.h" 8 | 9 | #include "bio.h" 10 | #include "virtio/virtio_blk.h" 11 | 12 | struct vhd_bdev { 13 | /* Base vdev */ 14 | struct vhd_vdev vdev; 15 | 16 | /* VM-facing interface type */ 17 | struct virtio_blk_dev vblk; 18 | 19 | LIST_ENTRY(vhd_bdev) blockdevs; 20 | }; 21 | 22 | static LIST_HEAD(, vhd_bdev) g_bdev_list = LIST_HEAD_INITIALIZER(g_bdev_list); 23 | 24 | #define VHD_BLOCKDEV_FROM_VDEV(ptr) containerof(ptr, struct vhd_bdev, vdev) 25 | 26 | /*////////////////////////////////////////////////////////////////////////////*/ 27 | 28 | static uint64_t vblk_get_features(struct vhd_vdev *vdev) 29 | { 30 | struct vhd_bdev *dev = VHD_BLOCKDEV_FROM_VDEV(vdev); 31 | return virtio_blk_get_features(&dev->vblk); 32 | } 33 | 34 | static int vblk_set_features(struct vhd_vdev *vdev, uint64_t features) 35 | { 36 | return 0; 37 | } 38 | 39 | /* vhost_get_config assumes that config is less than VHOST_USER_CONFIG_SPACE_MAX */ 40 | VHD_STATIC_ASSERT(sizeof(struct virtio_blk_config) <= VHOST_USER_CONFIG_SPACE_MAX); 41 | 42 | static size_t vblk_get_config(struct vhd_vdev *vdev, void *cfgbuf, 43 | size_t bufsize, size_t offset) 44 | { 45 | struct vhd_bdev *dev = VHD_BLOCKDEV_FROM_VDEV(vdev); 46 | 47 | return virtio_blk_get_config(&dev->vblk, cfgbuf, bufsize, offset); 48 | } 49 | 50 | static int vblk_dispatch(struct vhd_vdev *vdev, struct vhd_vring *vring) 51 | { 52 | struct vhd_bdev *dev = VHD_BLOCKDEV_FROM_VDEV(vdev); 53 | return virtio_blk_dispatch_requests(&dev->vblk, &vring->vq); 54 | } 55 | 56 | static void vblk_free(struct vhd_vdev *vdev) 57 | { 58 | struct vhd_bdev *bdev = VHD_BLOCKDEV_FROM_VDEV(vdev); 59 | 60 | LIST_REMOVE(bdev, blockdevs); 61 | virtio_blk_destroy_dev(&bdev->vblk); 62 | vhd_free(bdev); 63 | } 64 | 65 | static const struct vhd_vdev_type g_virtio_blk_vdev_type = { 66 | .desc = "virtio-blk", 67 | .get_features = vblk_get_features, 68 | .set_features = vblk_set_features, 69 | .get_config = vblk_get_config, 70 | .dispatch_requests = vblk_dispatch, 71 | .free = vblk_free, 72 | }; 73 | 74 | struct set_total_blocks { 75 | struct vhd_vdev *vdev; 76 | uint64_t total_blocks; 77 | }; 78 | 79 | static void set_total_blocks_entry(struct vhd_work *work, void *opaque) 80 | { 81 | struct set_total_blocks *stb = opaque; 82 | struct vhd_bdev *dev = VHD_BLOCKDEV_FROM_VDEV(stb->vdev); 83 | 84 | virtio_blk_set_total_blocks(&dev->vblk, stb->total_blocks); 85 | vhd_complete_work(work, 0); 86 | } 87 | 88 | void vhd_blockdev_set_total_blocks(struct vhd_vdev *vdev, uint64_t total_blocks) 89 | { 90 | struct set_total_blocks stb = { 91 | .vdev = vdev, 92 | .total_blocks = total_blocks, 93 | }; 94 | 95 | VHD_OBJ_INFO(vdev, "Set total blocks %" PRIu64, total_blocks); 96 | 97 | /* 98 | * Modify virtio config in g_vhost_evloop, to not interfere with .get_config 99 | * 100 | * We don't need vdev_submit_work_and_wait() logic here, as setting 101 | * total_blocks in config is unrelated stopping process, so it should not be 102 | * a problem intersect with wdev_stop_work work. 
103 | */ 104 | int ret = vhd_submit_ctl_work_and_wait(set_total_blocks_entry, &stb); 105 | VHD_VERIFY(ret == 0); 106 | } 107 | 108 | static bool blockdev_validate_features(const struct vhd_bdev_info *bdev) 109 | { 110 | const uint64_t valid_features = VHD_BDEV_F_READONLY | 111 | VHD_BDEV_F_DISCARD | 112 | VHD_BDEV_F_WRITE_ZEROES; 113 | return (bdev->features & valid_features) == bdev->features; 114 | } 115 | 116 | struct vhd_vdev *vhd_register_blockdev(const struct vhd_bdev_info *bdev, 117 | struct vhd_request_queue **rqs, 118 | int num_rqs, void *priv) 119 | { 120 | int res; 121 | uint32_t sector_size; 122 | 123 | if (!bdev->total_blocks || !bdev->block_size) { 124 | VHD_LOG_ERROR("Zero blockdev capacity %" PRIu64 " * %" PRIu32, 125 | bdev->total_blocks, bdev->block_size); 126 | return NULL; 127 | } 128 | 129 | sector_size = vhd_blockdev_sector_size(bdev); 130 | 131 | if (sector_size & (sector_size - 1) || sector_size % VHD_SECTOR_SIZE) { 132 | VHD_LOG_ERROR("Invalid sector size %" PRIu32 " must be a power " 133 | "of two and multiple of %llu", sector_size, 134 | VHD_SECTOR_SIZE); 135 | return NULL; 136 | } 137 | 138 | if ((bdev->block_size & (bdev->block_size - 1)) || 139 | bdev->block_size % sector_size) { 140 | VHD_LOG_ERROR("Block size %" PRIu32 " is not" 141 | " a power of two multiple of sector size (%" PRIu32 ")", 142 | bdev->block_size, sector_size); 143 | return NULL; 144 | } 145 | 146 | if (bdev->optimal_io_size % bdev->block_size) { 147 | VHD_LOG_ERROR("Optimal io size %" PRIu32 " is not" 148 | " a multiple of block size (%" PRIu32 ")", 149 | bdev->optimal_io_size, bdev->block_size); 150 | return NULL; 151 | } 152 | 153 | if (bdev->total_blocks > (UINT64_MAX / bdev->block_size)) { 154 | VHD_LOG_ERROR("Disk capacity %" PRIu64 " is too large!", 155 | bdev->total_blocks); 156 | return NULL; 157 | } 158 | 159 | if (!blockdev_validate_features(bdev)) { 160 | VHD_LOG_ERROR("Invalid blockdev features %" PRIu64, bdev->features); 161 | return NULL; 162 | } 163 | 164 | struct vhd_bdev *dev = vhd_zalloc(sizeof(*dev)); 165 | 166 | virtio_blk_init_dev(&dev->vblk, bdev); 167 | 168 | res = vhd_vdev_init_server(&dev->vdev, bdev->socket_path, 169 | &g_virtio_blk_vdev_type, 170 | bdev->num_queues, rqs, num_rqs, priv, 171 | bdev->map_cb, bdev->unmap_cb, 172 | bdev->pte_flush_byte_threshold); 173 | if (res != 0) { 174 | goto error_out; 175 | } 176 | 177 | LIST_INSERT_HEAD(&g_bdev_list, dev, blockdevs); 178 | return &dev->vdev; 179 | 180 | error_out: 181 | virtio_blk_destroy_dev(&dev->vblk); 182 | vhd_free(dev); 183 | return NULL; 184 | } 185 | 186 | void vhd_unregister_blockdev(struct vhd_vdev *vdev, 187 | void (*unregister_complete)(void *), void *arg) 188 | { 189 | vhd_vdev_stop_server(vdev, unregister_complete, arg); 190 | } 191 | -------------------------------------------------------------------------------- /vhost_spec.h: -------------------------------------------------------------------------------- 1 | /** 2 | * vhost-user protocol definitions 3 | */ 4 | 5 | #pragma once 6 | 7 | #include 8 | 9 | #ifdef __cplusplus 10 | extern "C" { 11 | #endif 12 | 13 | /* 14 | * Define protocol structures and definitions based on the vhost user 15 | * protocol specification: 16 | * https://github.com/qemu/qemu/blob/master/docs/interop/vhost-user.txt 17 | */ 18 | 19 | /* Vhost user protocol flags. */ 20 | /* This is a vhost protocol version. 
*/ 21 | #define VHOST_USER_VERSION_MASK 0x3 22 | #define VHOST_USER_MSG_VERSION 0x1 23 | #define VHOST_USER_MSG_FLAGS_REPLY ((1 << 2) | VHOST_USER_MSG_VERSION) 24 | #define VHOST_USER_MSG_FLAGS_REPLY_ACK (1 << 3) 25 | 26 | /* 27 | * Vhost user protocol features (GET_PROTOCOL_FEATURES and 28 | * SET_PROTOCOL_FEATURES commands). 29 | */ 30 | #define VHOST_USER_PROTOCOL_F_MQ 0 31 | #define VHOST_USER_PROTOCOL_F_LOG_SHMFD 1 32 | #define VHOST_USER_PROTOCOL_F_RARP 2 33 | #define VHOST_USER_PROTOCOL_F_REPLY_ACK 3 34 | #define VHOST_USER_PROTOCOL_F_MTU 4 35 | #define VHOST_USER_PROTOCOL_F_SLAVE_REQ 5 36 | #define VHOST_USER_PROTOCOL_F_CROSS_ENDIAN 6 37 | #define VHOST_USER_PROTOCOL_F_CRYPTO_SESSION 7 38 | #define VHOST_USER_PROTOCOL_F_PAGEFAULT 8 39 | #define VHOST_USER_PROTOCOL_F_CONFIG 9 40 | #define VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD 12 41 | #define VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS 15 42 | 43 | /* Vhost user features (GET_FEATURES and SET_FEATURES commands). */ 44 | #define VHOST_F_LOG_ALL 26 45 | #define VHOST_USER_F_PROTOCOL_FEATURES 30 46 | #define VIRTIO_F_RING_INDIRECT_DESC 28 47 | #define VIRTIO_F_RING_EVENT_IDX 29 48 | #define VIRTIO_F_VERSION_1 32 49 | 50 | /* 51 | * Invalid FD bit for the VHOST_USER_SET_VRING_KICK and 52 | * VHOST_USER_SET_VRING_CALL commands. If this bit is 53 | * set then the communication is forced to use polling 54 | * instead of using for a kick. 55 | */ 56 | #define VHOST_VRING_INVALID_FD (1 << 8) 57 | #define VHOST_VRING_IDX_MASK 0xff 58 | 59 | /* Maximum size of the device PCI configuration space. */ 60 | #define VHOST_USER_CONFIG_SPACE_MAX 256 61 | 62 | /* 63 | * According to the protocol specification this is the maximum number 64 | * of memory regions sent in one command. Also it is the maximum number 65 | * of file descriptors. 66 | */ 67 | #define VHOST_USER_MEM_REGIONS_MAX 8 68 | #define VHOST_USER_MAX_FDS VHOST_USER_MEM_REGIONS_MAX 69 | 70 | /* Define request types. 
*/ 71 | enum { 72 | VHOST_USER_NONE = 0, 73 | VHOST_USER_GET_FEATURES = 1, 74 | VHOST_USER_SET_FEATURES = 2, 75 | VHOST_USER_SET_OWNER = 3, 76 | VHOST_USER_RESET_OWNER = 4, 77 | VHOST_USER_SET_MEM_TABLE = 5, 78 | VHOST_USER_SET_LOG_BASE = 6, 79 | VHOST_USER_SET_LOG_FD = 7, 80 | VHOST_USER_SET_VRING_NUM = 8, 81 | VHOST_USER_SET_VRING_ADDR = 9, 82 | VHOST_USER_SET_VRING_BASE = 10, 83 | VHOST_USER_GET_VRING_BASE = 11, 84 | VHOST_USER_SET_VRING_KICK = 12, 85 | VHOST_USER_SET_VRING_CALL = 13, 86 | VHOST_USER_SET_VRING_ERR = 14, 87 | VHOST_USER_GET_PROTOCOL_FEATURES = 15, 88 | VHOST_USER_SET_PROTOCOL_FEATURES = 16, 89 | VHOST_USER_GET_QUEUE_NUM = 17, 90 | VHOST_USER_SET_VRING_ENABLE = 18, 91 | VHOST_USER_SEND_RARP = 19, 92 | VHOST_USER_NET_SET_MTU = 20, 93 | VHOST_USER_SET_SLAVE_REQ_FD = 21, 94 | VHOST_USER_IOTLB_MSG = 22, 95 | VHOST_USER_SET_VRING_ENDIAN = 23, 96 | VHOST_USER_GET_CONFIG = 24, 97 | VHOST_USER_SET_CONFIG = 25, 98 | VHOST_USER_CREATE_CRYPTO_SESSION = 26, 99 | VHOST_USER_CLOSE_CRYPTO_SESSION = 27, 100 | VHOST_USER_POSTCOPY_ADVISE = 28, 101 | VHOST_USER_POSTCOPY_LISTEN = 29, 102 | VHOST_USER_POSTCOPY_END = 30, 103 | VHOST_USER_GET_INFLIGHT_FD = 31, 104 | VHOST_USER_SET_INFLIGHT_FD = 32, 105 | VHOST_USER_GET_MAX_MEM_SLOTS = 36, 106 | VHOST_USER_ADD_MEM_REG = 37, 107 | VHOST_USER_REM_MEM_REG = 38, 108 | }; 109 | 110 | struct vhost_user_mem_region { 111 | uint64_t guest_addr; 112 | uint64_t size; 113 | uint64_t user_addr; 114 | uint64_t mmap_offset; 115 | }; 116 | 117 | struct vhost_user_mem_single_mem_desc { 118 | uint64_t _padding; 119 | struct vhost_user_mem_region region; 120 | }; 121 | 122 | struct vhost_user_mem_desc { 123 | uint32_t nregions; 124 | uint32_t _padding; 125 | struct vhost_user_mem_region regions[VHOST_USER_MEM_REGIONS_MAX]; 126 | }; 127 | 128 | struct vhost_user_vring_state { 129 | uint32_t index; 130 | uint32_t num; 131 | }; 132 | 133 | struct vhost_user_vring_addr { 134 | uint32_t index; 135 | #define VHOST_VRING_F_LOG (1 << 0) 136 | uint32_t flags; 137 | uint64_t desc_addr; 138 | uint64_t used_addr; 139 | uint64_t avail_addr; 140 | uint64_t used_gpa_base; 141 | }; 142 | 143 | struct vhost_user_config_space { 144 | uint32_t offset; 145 | uint32_t size; 146 | uint32_t flags; 147 | uint8_t payload[VHOST_USER_CONFIG_SPACE_MAX]; 148 | }; 149 | #define VHOST_CONFIG_HDR_SIZE (offsetof(struct vhost_user_config_space, payload)) 150 | 151 | struct vhost_user_inflight_desc { 152 | uint64_t mmap_size; 153 | uint64_t mmap_offset; 154 | uint16_t num_queues; 155 | uint16_t queue_size; 156 | }; 157 | 158 | struct inflight_split_desc { 159 | uint8_t inflight; 160 | uint8_t padding[5]; 161 | uint16_t next; 162 | uint64_t counter; 163 | }; 164 | 165 | struct inflight_split_region { 166 | uint64_t features; 167 | uint16_t version; 168 | uint16_t desc_num; 169 | uint16_t last_batch_head; 170 | uint16_t used_idx; 171 | struct inflight_split_desc desc[]; 172 | }; 173 | 174 | struct vhost_user_log { 175 | uint64_t size; 176 | uint64_t offset; 177 | }; 178 | 179 | struct vhost_user_msg_hdr { 180 | uint32_t req; 181 | uint32_t flags; 182 | uint32_t size; 183 | }; 184 | 185 | union vhost_user_msg_payload { 186 | /* 187 | * VHOST_USER_GET_QUEUE_NUM, VHOST_USER_GET_PROTOCOL_FEATURES, 188 | * VHOST_USER_GET_FEATURES, 189 | * VHOST_USER_SET_VRING_KICK, VHOST_USER_SET_VRING_CALL 190 | */ 191 | uint64_t u64; 192 | /* VHOST_USER_GET_CONFIG, VHOST_USER_SET_CONFIG */ 193 | struct vhost_user_config_space config; 194 | /* VHOST_USER_SET_MEM_TABLE */ 195 | struct vhost_user_mem_desc mem_desc; 196 | 
/* 197 | * VHOST_USER_GET_VRING_BASE, VHOST_USER_SET_VRING_BASE, 198 | * VHOST_USER_SET_VRING_NUM 199 | */ 200 | struct vhost_user_vring_state vring_state; 201 | /* VHOST_USER_SET_VRING_ADDR */ 202 | struct vhost_user_vring_addr vring_addr; 203 | /* VHOST_USER_GET_INFLIGHT_FD, VHOST_USER_SET_INFLIGHT_FD */ 204 | struct vhost_user_inflight_desc inflight_desc; 205 | /* VHOST_USER_SET_LOG_BASE */ 206 | struct vhost_user_log log; 207 | }; 208 | 209 | #ifdef __cplusplus 210 | } 211 | #endif 212 | -------------------------------------------------------------------------------- /tests/test_libvhost.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import os 3 | import shutil 4 | import signal 5 | import time 6 | import pytest 7 | from typing import Tuple, List, Generator 8 | 9 | 10 | # 1 GiB should be enough 11 | DISK_IMAGE_SIZE = 1024 * 1024 * 1024 12 | WORK_DIR = "work" 13 | TEST_SERVER_BINARY_ENV_PATH = "TEST_SERVER_BINARY" 14 | BLKIO_BENCH_ENV_PATH = "BLKIO_BENCH_BINARY" 15 | 16 | 17 | def base_dir_abs_path() -> str: 18 | return os.path.dirname(os.path.abspath(__file__)) 19 | 20 | 21 | def build_dir() -> str: 22 | return os.path.join(base_dir_abs_path(), os.pardir, "build") 23 | 24 | 25 | @pytest.fixture(scope="session") 26 | def blkio_bench() -> str: 27 | env_path = os.environ.get(BLKIO_BENCH_ENV_PATH) 28 | if env_path and os.path.exists(env_path): 29 | return env_path 30 | 31 | blkio_bench_path = os.path.join( 32 | build_dir(), "subprojects", "libblkio", "examples", "blkio-bench" 33 | ) 34 | if os.path.exists(blkio_bench_path): 35 | return blkio_bench_path 36 | 37 | raise RuntimeError("This test requires blkio-bench example program " 38 | "which comes with libblkio") 39 | 40 | 41 | @pytest.fixture(scope="session") 42 | def vhost_user_test_server() -> str: 43 | env_path = os.environ.get(TEST_SERVER_BINARY_ENV_PATH) 44 | if env_path and os.path.exists(env_path): 45 | return env_path 46 | 47 | server_path = os.path.join( 48 | build_dir(), "tests", "vhost-user-blk-test-server" 49 | ) 50 | if os.path.exists(server_path): 51 | return server_path 52 | 53 | raise RuntimeError("A valid path to the test server must be specified " 54 | f"in the {TEST_SERVER_BINARY_ENV_PATH} variable") 55 | 56 | 57 | @pytest.fixture(scope="session") 58 | def work_dir() -> Generator[str, None, None]: 59 | work_dir_path = os.path.join(base_dir_abs_path(), WORK_DIR) 60 | 61 | os.makedirs(work_dir_path, exist_ok=True) 62 | yield work_dir_path 63 | shutil.rmtree(work_dir_path) 64 | 65 | 66 | @pytest.fixture(scope="session") 67 | def disk_image(work_dir: str) -> Generator[str, None, None]: 68 | disk_image_path = os.path.join(work_dir, "disk-image.raw") 69 | 70 | with open(disk_image_path, "wb+") as f: 71 | f.seek(DISK_IMAGE_SIZE - 1) 72 | f.write(bytearray(1)) 73 | 74 | yield disk_image_path 75 | os.remove(disk_image_path) 76 | 77 | 78 | def create_server( 79 | work_dir: str, disk_image: str, vhost_user_test_server: str, 80 | pte_flush_threshold: int = 0, sector_size: int = 4096, 81 | block_size: int = 4096 82 | ) -> Generator[str, None, None]: 83 | socket_path = os.path.join(work_dir, "server.sock") 84 | 85 | process = subprocess.Popen([ 86 | vhost_user_test_server, "--disk", 87 | f"socket-path={socket_path},blk-file={disk_image}" 88 | f",serial=helloworld,pte-flush-threshold={pte_flush_threshold}" 89 | f",sector-size={sector_size},block-size={block_size}" 90 | ]) 91 | 92 | retry = 0 93 | retry_limit = 5 94 | 95 | while True: 96 | if os.path.exists(socket_path): 
97 | break 98 | 99 | if retry < retry_limit: 100 | retry += 1 101 | time.sleep(10) 102 | else: 103 | raise RuntimeError("Failed to start test server!") 104 | 105 | yield socket_path 106 | 107 | process.send_signal(signal.SIGINT) 108 | process.wait(10) 109 | 110 | 111 | @pytest.fixture(scope="class") 112 | def server_socket( 113 | work_dir: str, disk_image: str, vhost_user_test_server: str 114 | ) -> Generator[str, None, None]: 115 | yield from create_server(work_dir, disk_image, vhost_user_test_server) 116 | 117 | 118 | @pytest.fixture(scope="class") 119 | def server_socket_with_pte_flush( 120 | request: pytest.FixtureRequest, work_dir: str, disk_image: str, 121 | vhost_user_test_server: str 122 | ) -> Generator[str, None, None]: 123 | yield from create_server(work_dir, disk_image, vhost_user_test_server, 124 | request.param) 125 | 126 | 127 | @pytest.fixture(scope="class") 128 | def server_socket_with_custom_sector_size( 129 | request: pytest.FixtureRequest, work_dir: str, disk_image: str, 130 | vhost_user_test_server: str 131 | ) -> Generator[str, None, None]: 132 | yield from create_server(work_dir, disk_image, vhost_user_test_server, 133 | 0, *request.param) 134 | 135 | 136 | def pretty_print_blkio_config(param: List[str]) -> str: 137 | return f"{param[0]}, blocksize={param[1]}" 138 | 139 | 140 | def check_run_blkio_bench( 141 | path: str, type: str, blocksize: int, time: int, socket: str, 142 | threads: int = 1 143 | ) -> None: 144 | subprocess.check_call([ 145 | path, f"--blocksize={blocksize}", f"--runtime={time}", 146 | f"--readwrite={type}", f"--num-threads={threads}", 147 | "virtio-blk-vhost-user", f"path={socket}" 148 | ], timeout=time + 10) 149 | 150 | 151 | class TestBasic: 152 | @pytest.mark.parametrize( 153 | 'config', 154 | [ 155 | ["read", 1024 * 1024], 156 | ["write", 1024 * 1024], 157 | ["randread", 4096], 158 | ["randwrite", 4096], 159 | ], 160 | ids=pretty_print_blkio_config 161 | ) 162 | def test_basic_operations( 163 | self, server_socket: str, blkio_bench: str, config: Tuple[str, int] 164 | ) -> None: 165 | check_run_blkio_bench(blkio_bench, *config, 30, server_socket) 166 | 167 | 168 | @pytest.mark.parametrize( 169 | 'server_socket_with_pte_flush, time', 170 | [ 171 | # Flush every 1 byte processed for 5 seconds 172 | [1, 5], 173 | # Flush every 50MiB processed for 30 seconds 174 | [50 * 1024 * 1024, 30], 175 | ], 176 | indirect=['server_socket_with_pte_flush'] 177 | ) 178 | class TestPTEFlush: 179 | def test_pte_flush( 180 | self, server_socket_with_pte_flush: str, time: int, 181 | blkio_bench: str 182 | ) -> None: 183 | check_run_blkio_bench(blkio_bench, "randread", 4096, time, 184 | server_socket_with_pte_flush) 185 | 186 | 187 | @pytest.mark.parametrize( 188 | 'server_socket_with_custom_sector_size, block_size', 189 | [ 190 | # Test different sector sizes 191 | # Format is: [[sector_size, block_size], blkio_bench_block_size] 192 | [[1024, 2048], 2048], 193 | [[4096, 4096], 4096], 194 | [[2048, 8192], 8192], 195 | ], 196 | indirect=['server_socket_with_custom_sector_size'] 197 | ) 198 | class TestSectorSizes: 199 | def test_sector_sizes( 200 | self, server_socket_with_custom_sector_size: str, block_size: int, 201 | blkio_bench: str 202 | ) -> None: 203 | check_run_blkio_bench(blkio_bench, "randread", block_size, 1, 204 | server_socket_with_custom_sector_size) 205 | -------------------------------------------------------------------------------- /platform.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | 
#include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #ifdef __cplusplus 15 | extern "C" { 16 | #endif 17 | 18 | #ifdef __has_feature 19 | # define VHD_HAS_FEATURE(x) __has_feature(x) 20 | #else 21 | # define VHD_HAS_FEATURE(x) 0 22 | #endif 23 | 24 | #define HUGE_PAGE_SIZE 0x40000000 // 1G, works also for 2M pages alignment 25 | 26 | /*////////////////////////////////////////////////////////////////////////////*/ 27 | 28 | #if !defined(NDEBUG) 29 | # define VHD_DEBUG 30 | #endif 31 | 32 | /*////////////////////////////////////////////////////////////////////////////*/ 33 | 34 | #if !defined(containerof) 35 | # define containerof(ptr, type, member) \ 36 | ((type *) ((char *)(ptr) - offsetof(type, member))) 37 | #endif 38 | 39 | #if !defined(countof) 40 | # define countof(a) (sizeof(a) / sizeof(*a)) 41 | #endif 42 | 43 | #ifndef likely 44 | #define likely(x) __builtin_expect(!!(x), 1) 45 | #define unlikely(x) __builtin_expect(!!(x), 0) 46 | #endif 47 | 48 | #ifdef __cplusplus 49 | # define VHD_STATIC_ASSERT(pred) static_assert((pred), __STRINGIFY(pred)) 50 | #elif (__STDC_VERSION__ >= 201112L) 51 | # define VHD_STATIC_ASSERT(pred) _Static_assert((pred), __STRINGIFY(pred)) 52 | #else 53 | # error Implement me 54 | #endif 55 | 56 | /* TODO: compiler-specifics for non-gcc? */ 57 | #ifdef __GNUC__ 58 | # define __STRINGIFY(x) #x 59 | # define VHD_NORETURN __attribute__((noreturn)) 60 | # define VHD_TYPEOF __typeof 61 | # define VHD_PACKED __attribute__((packed)) 62 | 63 | #define VHD_ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) 64 | 65 | /* Return 0-based index of first least significant bit set in 32-bit value */ 66 | static inline int vhd_find_first_bit32(uint32_t val) 67 | { 68 | VHD_STATIC_ASSERT(sizeof(val) == sizeof(int)); 69 | return __builtin_ctz(val); 70 | } 71 | 72 | /* Return 0-based index of first least significant bit set in 64-bit value */ 73 | static inline int vhd_find_first_bit64(uint64_t val) 74 | { 75 | VHD_STATIC_ASSERT(sizeof(val) == sizeof(long long)); 76 | return __builtin_ctzll(val); 77 | } 78 | 79 | #else 80 | # error Implement me 81 | #endif 82 | 83 | /* 84 | * MIN/MAX implementations with intuitive behavior: 85 | * - type safety 86 | * - exactly-once evaluation of both arguments 87 | * Note: unsuitable in constant expressions 88 | */ 89 | #define __safe_cmp(a, op, b) \ 90 | ({ \ 91 | typeof(1 ? (a) : (b)) _a = (a), _b = (b); \ 92 | _a op _b ? 
_a : _b; \ 93 | }) 94 | 95 | #undef MIN 96 | #define MIN(a, b) __safe_cmp(a, <, b) 97 | #undef MAX 98 | #define MAX(a, b) __safe_cmp(a, >, b) 99 | 100 | /*////////////////////////////////////////////////////////////////////////////*/ 101 | 102 | static inline void VHD_NORETURN _vhd_verify_helper( 103 | const char *what, 104 | const char *file, 105 | unsigned long line) 106 | { 107 | /* TODO: smarter logging */ 108 | fprintf(stderr, "Verify failed: \"%s\" at %s:%lu\n", what, file, line); 109 | abort(); 110 | } 111 | 112 | #define VHD_ASSERT(cond) assert(cond) 113 | #define VHD_UNREACHABLE() __builtin_unreachable() 114 | 115 | /* Verify is not compiled out in release builds */ 116 | #define VHD_VERIFY(cond) \ 117 | do { \ 118 | if (!(cond)) { \ 119 | _vhd_verify_helper(#cond, __FILE__, __LINE__); \ 120 | } \ 121 | } while (0) 122 | 123 | /*////////////////////////////////////////////////////////////////////////////*/ 124 | 125 | #ifdef VHD_MEMCHECK 126 | # include 127 | # define VHD_MEMCHECK_DEFINED(addr, len) \ 128 | VALGRIND_MAKE_MEM_DEFINED(addr, len) 129 | # define VHD_MEMCHECK_UNDEFINED(addr, len) \ 130 | VALGRIND_MAKE_MEM_UNDEFINED(addr, len) 131 | #else 132 | # define VHD_MEMCHECK_DEFINED(addr, len) 133 | # define VHD_MEMCHECK_UNDEFINED(addr, len) 134 | #endif 135 | 136 | /*////////////////////////////////////////////////////////////////////////////*/ 137 | 138 | #define VHD_ALIGN_UP(x, a) ({ \ 139 | VHD_TYPEOF(x) __mask = (VHD_TYPEOF(x))(a) - 1; \ 140 | ((x) + __mask) & ~__mask; \ 141 | }) 142 | #define VHD_ALIGN_DOWN(x, a) ((x) & ~((VHD_TYPEOF(x))(a) - 1)) 143 | #define VHD_IS_ALIGNED(x, a) (!((x) & ((VHD_TYPEOF(x))(a) - 1))) 144 | #define VHD_ALIGN_PTR_UP(x, a) (VHD_TYPEOF(x))VHD_ALIGN_UP((uintptr_t)x, a) 145 | 146 | static inline void *vhd_alloc(size_t bytes) 147 | { 148 | /* malloc actually accepts 0 sizes, but this is still most likely a bug.. */ 149 | VHD_ASSERT(bytes != 0); 150 | 151 | void *p = malloc(bytes); 152 | VHD_VERIFY(p != NULL); 153 | return p; 154 | } 155 | 156 | static inline void *vhd_zalloc(size_t bytes) 157 | { 158 | /* calloc actually accepts 0 sizes, but this is still most likely a bug.. */ 159 | VHD_ASSERT(bytes != 0); 160 | 161 | void *p = calloc(bytes, 1); 162 | VHD_VERIFY(p != NULL); 163 | return p; 164 | } 165 | 166 | static inline void *vhd_calloc(size_t nmemb, size_t size) 167 | { 168 | VHD_ASSERT(nmemb != 0 && size != 0); 169 | 170 | void *p = calloc(nmemb, size); 171 | VHD_VERIFY(p != NULL); 172 | return p; 173 | } 174 | 175 | /* TODO: aligned alloc */ 176 | 177 | static inline void vhd_free(void *p) 178 | { 179 | free(p); 180 | } 181 | 182 | static inline char *vhd_strdup(const char *s) __attribute__((malloc)); 183 | static inline char *vhd_strdup(const char *s) 184 | { 185 | size_t len; 186 | char *t; 187 | 188 | if (!s) { 189 | return NULL; 190 | } 191 | 192 | len = strlen(s) + 1; 193 | t = (char *)vhd_alloc(len); 194 | memcpy(t, s, len); 195 | return t; 196 | } 197 | 198 | static inline char *vhd_strdup_printf(const char *fmt, ...) 199 | __attribute__((format(printf, 1, 2), malloc)); 200 | static inline char *vhd_strdup_printf(const char *fmt, ...) 
201 | { 202 | int len; 203 | size_t size; 204 | char *ret; 205 | va_list args; 206 | 207 | va_start(args, fmt); 208 | len = vsnprintf(NULL, 0, fmt, args); 209 | va_end(args); 210 | 211 | if (len < 0) { 212 | return NULL; 213 | } 214 | 215 | size = (size_t)len + 1; 216 | ret = (char *)vhd_alloc(size); 217 | 218 | va_start(args, fmt); 219 | len = vsnprintf(ret, size, fmt, args); 220 | va_end(args); 221 | 222 | if (len < 0) { 223 | vhd_free(ret); 224 | return NULL; 225 | } 226 | return ret; 227 | } 228 | 229 | int init_platform_page_size(void); 230 | 231 | extern size_t platform_page_size; 232 | 233 | #ifdef __cplusplus 234 | } 235 | #endif 236 | -------------------------------------------------------------------------------- /vdev.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "event.h" 6 | #include "queue.h" 7 | 8 | #include "virtio/virt_queue.h" 9 | 10 | #ifdef __cplusplus 11 | extern "C" { 12 | #endif 13 | 14 | struct vhd_vdev; 15 | struct vhd_vring; 16 | struct vhd_request_queue; 17 | 18 | /** 19 | * Vhost device type description. 20 | */ 21 | struct vhd_vdev_type { 22 | /* Human-readable description */ 23 | const char *desc; 24 | 25 | /* Polymorphic type ops */ 26 | uint64_t (*get_features)(struct vhd_vdev *vdev); 27 | int (*set_features)(struct vhd_vdev *vdev, uint64_t features); 28 | size_t (*get_config)(struct vhd_vdev *vdev, void *cfgbuf, 29 | size_t bufsize, size_t offset); 30 | int (*dispatch_requests)(struct vhd_vdev *vdev, struct vhd_vring *vring); 31 | void (*free)(struct vhd_vdev *vdev); 32 | }; 33 | 34 | struct vhd_memory_map; 35 | struct vhd_memory_log; 36 | struct vhd_work; 37 | 38 | /** 39 | * Vhost generic device instance. 40 | * 41 | * Devices are polymorphic through their respective types. 42 | */ 43 | struct vhd_vdev { 44 | char *log_tag; 45 | 46 | /* Accosiated client private data */ 47 | void *priv; 48 | 49 | /* Device type description */ 50 | const struct vhd_vdev_type *type; 51 | 52 | /* Server socket fd when device is a vhost-user server */ 53 | int listenfd; 54 | struct vhd_io_handler *listen_handler; 55 | 56 | /* Connected device fd. Single active connection per device. */ 57 | int connfd; 58 | struct vhd_io_handler *conn_handler; 59 | 60 | /* Message currently being handled */ 61 | uint32_t req; 62 | 63 | /* Timing for message handling */ 64 | struct timespec msg_handling_started; 65 | int timerfd; 66 | struct vhd_io_handler *timer_handler; 67 | 68 | /* Attached request queues */ 69 | struct vhd_request_queue **rqs; 70 | int num_rqs; 71 | 72 | /* 73 | * Vhost protocol features which can be supported for this vdev and 74 | * those which have been actually enabled during negotiation. 
75 | */ 76 | uint64_t supported_protocol_features; 77 | uint64_t negotiated_protocol_features; 78 | uint64_t supported_features; 79 | uint64_t negotiated_features; 80 | 81 | /* Maximum amount of request queues this device can support */ 82 | uint16_t num_queues; 83 | struct vhd_vring *vrings; /* Total num_queues elements */ 84 | 85 | /* Gets called after mapping guest memory region */ 86 | int (*map_cb)(void *addr, size_t len); 87 | 88 | /* Gets called before unmapping guest memory region */ 89 | int (*unmap_cb)(void *addr, size_t len); 90 | 91 | struct vhd_memory_map *memmap; 92 | struct vhd_memory_map *old_memmap; 93 | struct vhd_memory_log *memlog; 94 | struct vhd_memory_log *old_memlog; 95 | 96 | /** 97 | * Shared memory to store information about inflight requests and restore 98 | * virtqueue state after reconnect. 99 | */ 100 | struct inflight_split_region *inflight_mem; 101 | uint64_t inflight_size; 102 | 103 | size_t pte_flush_byte_threshold; 104 | int64_t bytes_left_before_pte_flush; 105 | 106 | /* #vrings which may have requests in flight */ 107 | uint16_t num_vrings_in_flight; 108 | /* #vrings started and haven't yet acknowledged stop */ 109 | uint16_t num_vrings_started; 110 | 111 | /* callback and arg to be called when the device is released */ 112 | void (*release_cb)(void *); 113 | void *release_arg; 114 | 115 | /** Global vdev list */ 116 | LIST_ENTRY(vhd_vdev) vdev_list; 117 | 118 | /* #vrings performing an action in response to a control message */ 119 | uint16_t num_vrings_handling_msg; 120 | /* function to call once the current message is handled in all vrings */ 121 | int (*handle_complete)(struct vhd_vdev *vdev); 122 | 123 | /* whether an ACK should be sent once the message is handled */ 124 | bool ack_pending; 125 | bool pte_flush_pending; 126 | 127 | /* fd to keep open until handle_complete and to close there */ 128 | int keep_fd; 129 | 130 | struct vhd_work *work; 131 | }; 132 | 133 | /** 134 | * Init new generic vhost device in server mode 135 | * @socket_path Listen socket path 136 | * @type Device type description 137 | * @vdev vdev instance to initialize 138 | * @max_queues Maximum number of queues this device can support 139 | * @rqs Associated request queues 140 | * @num_rqs Number of request queues 141 | * @priv User private data 142 | * @map_cb User function to call after mapping guest memory 143 | * @unmap_cb User function to call before unmapping guest memory 144 | * @pte_flush_byte_threshold 145 | * Number of bytes to process before flushing the PTEs 146 | * of the guest address space 147 | */ 148 | int vhd_vdev_init_server( 149 | struct vhd_vdev *vdev, 150 | const char *socket_path, 151 | const struct vhd_vdev_type *type, 152 | int max_queues, 153 | struct vhd_request_queue **rqs, int num_rqs, 154 | void *priv, 155 | int (*map_cb)(void *addr, size_t len), 156 | int (*unmap_cb)(void *addr, size_t len), 157 | size_t pte_flush_byte_threshold); 158 | 159 | /** 160 | * Stop vhost device. Once this returns no more new requests will reach the 161 | * backend. @release_cb(@release_arg) will be called once all requests are 162 | * completed and the associated resources released. 
163 | */ 164 | int vhd_vdev_stop_server(struct vhd_vdev *vdev, 165 | void (*release_cb)(void *), void *release_arg); 166 | 167 | /** 168 | * Device vring instance 169 | */ 170 | struct vhd_vring { 171 | struct vhd_vdev *vdev; 172 | char *log_tag; 173 | 174 | int kickfd; 175 | int callfd; 176 | int errfd; 177 | 178 | /* started as seen from control plane */ 179 | bool started_in_ctl; 180 | /* requested to disconnect */ 181 | bool disconnecting; 182 | 183 | /* Client kick event */ 184 | struct vhd_io_handler *kick_handler; 185 | 186 | /* called in control plane once vring is drained */ 187 | int (*on_drain_cb)(struct vhd_vring *); 188 | 189 | /* 190 | * vq attributes that may change while vring is started; these are updated 191 | * in the control event loop and propagated via BH into vq 192 | */ 193 | struct { 194 | uint64_t desc_addr; 195 | uint64_t used_addr; 196 | uint64_t avail_addr; 197 | uint32_t flags; 198 | uint64_t used_gpa_base; 199 | void *desc; 200 | void *used; 201 | void *avail; 202 | struct vhd_memory_map *mm; 203 | struct vhd_memory_log *log; 204 | bool enabled; 205 | } shadow_vq; 206 | 207 | /* 208 | * the fields below are only accessed in dataplane unless the vring is 209 | * known to be stopped 210 | */ 211 | struct virtio_virtq vq; 212 | /* started as seen from dataplane */ 213 | bool started_in_rq; 214 | /* #requests pending completion */ 215 | uint16_t num_in_flight; 216 | /* #requests pending completion when the queue is requested to stop */ 217 | uint16_t num_in_flight_at_stop; 218 | }; 219 | 220 | #define VHD_VRING_FROM_VQ(ptr) containerof(ptr, struct vhd_vring, vq) 221 | 222 | struct vhd_request_queue *vhd_get_rq_for_vring(struct vhd_vring *vring); 223 | 224 | void vhd_vring_inc_in_flight(struct vhd_vring *vring); 225 | void vhd_vring_dec_in_flight(struct vhd_vring *vring); 226 | 227 | #ifdef __cplusplus 228 | } 229 | #endif 230 | -------------------------------------------------------------------------------- /docs/logo.svg: -------------------------------------------------------------------------------- (SVG image data omitted) -------------------------------------------------------------------------------- /docs/architecture.md: -------------------------------------------------------------------------------- 1 | # libvhost-server architecture 2 | 3 | `libvhost-server` (henceforth `libvhost`) is a component that helps implement 4 | vhost-user device servers. Its main purpose is to insulate the server from the 5 | vhost-user control protocol, memory mapping, address translation, virtio queue 6 | processing, and so on. 7 | 8 | It's designed to transfer requests and responses between the guest drivers and 9 | the backend efficiently, with as little added latency and mutual interference 10 | between devices as possible. 11 | 12 | This component is implemented as a library. An application implementing a 13 | vhost-user device (slave in vhost-user speak) links to this library and calls 14 | into the API it exposes (so it's called _user_ hereinafter); this allows it to 15 | accept connections from a _client_ (vhost-user master, e.g. qemu) and process 16 | its virtio queues.
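
To make the split between the library and the user concrete, here is a minimal setup sketch for a vhost-user-blk backend. It is illustrative only: the `vhost/server.h` include location, the `NULL` log callback and the backend hand-off are assumptions, while the request-queue and blockdev calls mirror the declarations in `server.c` and `include/vhost/blockdev.h`.

```c
#include <errno.h>
#include <pthread.h>
#include <stddef.h>

#include "vhost/blockdev.h"   /* vhd_register_blockdev(), struct vhd_bdev_info */
#include "vhost/server.h"     /* assumed home of the server/request-queue API */

/* Dataplane thread: one request queue event loop, run explicitly by the user. */
static void *rq_thread_func(void *arg)
{
    struct vhd_request_queue *rq = arg;

    /* vhd_run_queue() returns -EAGAIN while the loop should keep going. */
    while (vhd_run_queue(rq) == -EAGAIN) {
        struct vhd_request req;

        while (vhd_dequeue_request(rq, &req)) {
            struct vhd_bdev_io *bio = vhd_get_bdev_io(req.io);
            /*
             * Hand bio->first_sector / bio->total_sectors / bio->sglist to the
             * real storage backend here (the backend itself is out of scope);
             * completion is reported back through the library's bio machinery.
             */
            (void)bio;
        }
    }
    return NULL;
}

int main(void)
{
    struct vhd_request_queue *rq;
    pthread_t dataplane;

    /* Control plane: spawns the library-owned control event loop thread. */
    if (vhd_start_vhost_server(NULL /* log callback; NULL assumed OK */) < 0) {
        return 1;
    }

    rq = vhd_create_request_queue();
    if (!rq) {
        return 1;
    }
    pthread_create(&dataplane, NULL, rq_thread_func, rq);

    struct vhd_bdev_info bdev = {
        .serial       = "sample-serial",
        .socket_path  = "/tmp/vhost-blk.sock",   /* hypothetical path */
        .block_size   = 4096,
        .total_blocks = 1024 * 1024,             /* 4 GiB with 4 KiB blocks */
        .num_queues   = 1,
    };

    /* All virtio queues of this device are served by the single request queue. */
    struct vhd_vdev *vdev = vhd_register_blockdev(&bdev, &rq, 1, NULL);
    if (!vdev) {
        return 1;
    }

    /* ... serve requests until shutdown is requested; the teardown sequence is
     * sketched near the end of this document ... */
    return 0;
}
```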
17 | 18 | ## Lockless event loops and bottom halves 19 | 20 | For efficiency in a highly concurrent environment, the library implements 21 | lockless event loops: it tends to avoid using sleeping synchronization 22 | primitives like mutexes. 23 | 24 | To coordinate different contexts that execute event loops, _bottom halves_ are 25 | used: functions that are scheduled to run (soon) on the target event loop. 26 | 27 | ## Contexts of execution 28 | 29 | The library assumes there are different kinds of execution contexts: 30 | 31 | 1. dataplane aka request queue event loop 32 | 33 | This is where virtio queues are processed. There may be multiple request 34 | queue event loops. Typically every request queue event loop is run in its 35 | own thread. Every virtio queue is associated permanently with a request 36 | queue. _Currently all virtio queues of a device are associated with a 37 | single request queue, but this limitation will be lifted._ 38 | 39 | The request queue event loop is supposed to be run explicitly by the user. 40 | On each iteration, the event loop blocks until a host notification is 41 | signaled on any of its virtio queues. Once it's woken up, it extracts all 42 | available virtio elements from all signaled virtio queues and forms device 43 | requests out of them. It may then process some simple ones synchronously; 44 | otherwise it enqueues them in a double-ended queue, common for all 45 | associated virtio queues (this helps avoid starvation). 46 | 47 | The user dequeues the requests from this request queue and submits them for 48 | asynchronous processing in another context outside of `libvhost` scope. 49 | 50 | Once the request is fully processed, the user submits a completion function 51 | (via a bottom half) back onto the request queue event loop; this leads to 52 | releasing the resources associated with the request and publishing the 53 | result to the client. 54 | 55 | 2. control event loop 56 | 57 | This is a **single** library-global event loop handling state transitions 58 | and vhost-user socket communications for all devices served by the library. 59 | The thread running the control event loop is owned by the library: it's 60 | created at library initialization and stopped on deinitialization. 61 | 62 | 3. external contexts 63 | 64 | These are external contexts that initialize and deinitialize devices to be 65 | handled by the library. These operations are **not** lockless: the 66 | respective functions block until the control loop acknowledges the requested 67 | state transition of the device. 68 | 69 | ## Device state machine 70 | 71 | Device (`struct vhd_vdev`) state transitions happen in response to client 72 | actions -- connect, disconnect, vhost-user control messages, and user actions 73 | -- device start and stop. They all happen in the control event loop. 74 | 75 | However, some of the device state transitions require associated state 76 | transitions in the device virtio queues (`struct vhd_vring`).
In order to 77 | maintain the lockless nature of the library, such device state transitions 78 | happen in several stages: 79 | 80 | - first the transition is started in the control event loop where input 81 | parameters are verified and internal state is prepared, to be later exposed 82 | in the dataplane 83 | 84 | - then corresponding state transitions of the virtio queues in the respective 85 | request queue event loop(s) are scheduled via bottom halves 86 | 87 | - those, in turn, put the internal state prepared earlier into effect in the 88 | dataplane and signal completion of the transition via bottom halves back to 89 | the control event loop 90 | 91 | - then the device state transition finishes and the reply is sent to the 92 | client, if needed. 93 | 94 | The messages from a single client are never handled concurrently: upon 95 | reception of a message the state machine suspends reception of further messages 96 | on the socket until the current one is fully handled and the reply is sent. 97 | The only action which may intervene is the device disconnection, either due to 98 | the client shutting down its end or due to the external context issuing the 99 | device stop. 100 | 101 | ### User-initiated device stop or client disconnection 102 | 103 | Certain complexity arises from the fact that when the device disconnection 104 | happens, some requests on the device may have been dequeued from the request 105 | queue and submitted to the backend for asynchronous handling. 106 | 107 | Therefore, disconnection is multi-stage: 108 | 109 | - if the disconnection is initiated by the user doing a device stop call in an 110 | external context, it passes the request to the device in the control event 111 | loop and blocks on a semaphore 112 | 113 | - the socket connected to the client is closed 114 | 115 | - all virtio queues are requested to stop via bottom halves in the request 116 | queue event loop(s) 117 | 118 | - in the request queue event loop 119 | 120 | * the virtio queue is stopped, i.e.
no more requests are fetched from the 121 | virtio queue 122 | 123 | * the requests that have already been fetched from the virtio queue but still 124 | remain in the request queue are canceled: completed immediately with a 125 | special status such that the completion is not exposed to the client, 126 | effectively dropping the request, in the expectation that it will be 127 | resubmitted by the vhost-user in_flight mechanism; this ensures that no 128 | more requests from this virtio queue will enter the backend 129 | 130 | * the virtio queue acknowledges the stop to the device in the control event 131 | loop context via a bottom half 132 | 133 | - if the disconnection was user-initiated, once the device sees all its virtio 134 | queues acknowledge the stop, it releases the semaphore so that the device 135 | stop call unblocks and returns in its external context; from this point on 136 | the backend is guaranteed that no more requests will be submitted 137 | 138 | - once all device requests that were caught in the backend by the disconnection 139 | are completed, leaving no more requests in the whole pipeline and thus 140 | ensuring that nothing will touch the guest memory any more, the device 141 | proceeds with the cleanup: 142 | 143 | * if the disconnection was user-initiated, the device shuts itself down, 144 | closes the listening socket, releases remaining resources and executes a 145 | previously set up callback to inform the external context that the device 146 | is fully terminated and freed 147 | 148 | * otherwise the device resets its state and resumes listening for incoming 149 | connections. 150 | (An API-level sketch of the user-initiated teardown is given at the end of this document.) 151 | ### Live migration support 152 | 153 | Live migration support basically adheres to the virtio spec, with a notable 154 | extension, which appears underspecified there: 155 | 156 | Before the `VHOST_USER_GET_VRING_BASE` message is replied to, all requests in 157 | the virtio queue are drained and completed to the client, leaving no in-flight 158 | requests and thus making it safe to resume operation upon migration. 159 | 160 | ### Reconnection support 161 | 162 | The library supports starting in a mode where the client survived the server's 163 | premature termination and wants to re-establish the connection and resume 164 | operation. 165 | 166 | The library basically adheres to the spec in this regard, with a few points of 167 | note: 168 | 169 | - the in-flight region is initially created in a memfd (there's no requirement 170 | that it's in a memfd when the connection is re-established) 171 | 172 | - when the device is stopped by the user while there still is an open 173 | connection with the client, the requests that happen to be in flight are 174 | canceled, ensuring that the backend internals don't touch the client's memory 175 | after the stop call is acknowledged.
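
In terms of the public API, the user-initiated stop described above could look roughly like the sketch below. The semaphore-based wait and the helper itself are illustrative assumptions; the `vhd_unregister_blockdev()`, `vhd_stop_queue()`, `vhd_release_request_queue()` and `vhd_stop_vhost_server()` calls follow their declarations in the headers and `server.c`.

```c
#include <pthread.h>
#include <semaphore.h>

#include "vhost/blockdev.h"   /* same assumed headers as in the setup sketch */
#include "vhost/server.h"

static sem_t g_unregister_done;

/* Runs once all of the device's requests are completed and it is freed. */
static void on_unregistered(void *arg)
{
    (void)arg;
    sem_post(&g_unregister_done);
}

/*
 * Illustrative teardown helper: vdev, rq and dataplane are the objects created
 * in the setup sketch earlier in this document.
 */
static void shutdown_device(struct vhd_vdev *vdev, struct vhd_request_queue *rq,
                            pthread_t dataplane)
{
    sem_init(&g_unregister_done, 0, 0);

    /* 1. Ask the device to stop and wait for the release callback. */
    vhd_unregister_blockdev(vdev, on_unregistered, NULL);
    sem_wait(&g_unregister_done);

    /* 2. Terminate the (now idle) request queue event loop and reclaim it. */
    vhd_stop_queue(rq);
    pthread_join(dataplane, NULL);
    vhd_release_request_queue(rq);

    /* 3. Finally stop the library-global control event loop. */
    vhd_stop_vhost_server();
}
```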
176 | -------------------------------------------------------------------------------- /server.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "platform.h" 4 | #include "server_internal.h" 5 | #include "queue.h" 6 | #include "bio.h" 7 | #include "logging.h" 8 | #include "vdev.h" 9 | 10 | #define VHOST_EVENT_LOOP_EVENTS 128 11 | 12 | static struct vhd_event_loop *g_vhost_evloop; 13 | static pthread_t g_vhost_thread; 14 | 15 | static inline void free_vhost_event_loop(void) 16 | { 17 | vhd_free_event_loop(g_vhost_evloop); 18 | g_vhost_evloop = NULL; 19 | } 20 | 21 | static __thread bool g_is_ctl_thread; 22 | 23 | bool vhd_in_ctl_thread(void) 24 | { 25 | return g_is_ctl_thread; 26 | } 27 | 28 | static void *vhost_evloop_func(void *arg) 29 | { 30 | int res; 31 | 32 | g_is_ctl_thread = true; 33 | 34 | do { 35 | res = vhd_run_event_loop(g_vhost_evloop, -1); 36 | } while (res == -EAGAIN); 37 | 38 | if (res < 0) { 39 | VHD_LOG_ERROR("vhost event loop iteration failed: %d", res); 40 | } 41 | 42 | return NULL; 43 | } 44 | 45 | int vhd_start_vhost_server(log_function log_fn) 46 | { 47 | int res; 48 | 49 | res = init_platform_page_size(); 50 | if (res != 0) { 51 | VHD_LOG_ERROR("failed to init platform page size: %d", res); 52 | return -res; 53 | } 54 | 55 | if (g_vhost_evloop != NULL) { 56 | return 0; 57 | } 58 | 59 | g_log_fn = log_fn; 60 | 61 | g_vhost_evloop = vhd_create_event_loop(VHOST_EVENT_LOOP_EVENTS); 62 | if (!g_vhost_evloop) { 63 | VHD_LOG_ERROR("failed to create vhost event loop"); 64 | return -EIO; 65 | } 66 | 67 | res = pthread_create(&g_vhost_thread, NULL, vhost_evloop_func, NULL); 68 | if (res != 0) { 69 | VHD_LOG_ERROR("failed to start vhost event loop thread: %d", res); 70 | free_vhost_event_loop(); 71 | return -res; 72 | } 73 | 74 | return 0; 75 | } 76 | 77 | void vhd_stop_vhost_server(void) 78 | { 79 | if (!g_vhost_evloop) { 80 | return; 81 | } 82 | 83 | vhd_terminate_event_loop(g_vhost_evloop); 84 | pthread_join(g_vhost_thread, NULL); 85 | free_vhost_event_loop(); 86 | } 87 | 88 | struct vhd_io_handler *vhd_add_vhost_io_handler(int fd, 89 | int (*read)(void *opaque), 90 | void *opaque) 91 | { 92 | return vhd_add_io_handler(g_vhost_evloop, fd, read, opaque); 93 | } 94 | 95 | void vhd_run_in_ctl(void (*cb)(void *), void *opaque) 96 | { 97 | vhd_bh_schedule_oneshot(g_vhost_evloop, cb, opaque); 98 | } 99 | 100 | int vhd_submit_ctl_work_and_wait(void (*func)(struct vhd_work *, void *), 101 | void *opaque) 102 | { 103 | return vhd_submit_work_and_wait(g_vhost_evloop, func, opaque); 104 | } 105 | 106 | /*////////////////////////////////////////////////////////////////////////////*/ 107 | 108 | /* 109 | * Request queues 110 | */ 111 | 112 | typedef SLIST_HEAD(, vhd_io) vhd_io_list; 113 | 114 | /* TODO: bounded queue */ 115 | struct vhd_request_queue { 116 | struct vhd_event_loop *evloop; 117 | 118 | TAILQ_HEAD(, vhd_io) submission; 119 | TAILQ_HEAD(, vhd_io) inflight; 120 | vhd_io_list completion; 121 | 122 | struct vhd_bh *completion_bh; 123 | struct vhd_rq_metrics metrics; 124 | }; 125 | 126 | void vhd_run_in_rq(struct vhd_request_queue *rq, void (*cb)(void *), 127 | void *opaque) 128 | { 129 | vhd_bh_schedule_oneshot(rq->evloop, cb, opaque); 130 | } 131 | 132 | static void req_complete(struct vhd_io *io) 133 | { 134 | /* completion_handler destroys bio. 
save vring for unref */ 135 | struct vhd_vring *vring = io->vring; 136 | io->completion_handler(io); 137 | vhd_vring_dec_in_flight(vring); 138 | } 139 | 140 | static void rq_complete_bh(void *opaque) 141 | { 142 | struct vhd_request_queue *rq = opaque; 143 | vhd_io_list io_list, io_list_reverse; 144 | 145 | SLIST_INIT(&io_list); 146 | SLIST_INIT(&io_list_reverse); 147 | /* steal completion list from rq, swap for a fresh one */ 148 | SLIST_MOVE_ATOMIC(&io_list_reverse, &rq->completion); 149 | 150 | /* the list was filled LIFO, we want the completions FIFO */ 151 | for (;;) { 152 | struct vhd_io *io = SLIST_FIRST(&io_list_reverse); 153 | if (!io) { 154 | break; 155 | } 156 | SLIST_REMOVE_HEAD(&io_list_reverse, completion_link); 157 | SLIST_INSERT_HEAD(&io_list, io, completion_link); 158 | } 159 | 160 | for (;;) { 161 | struct vhd_io *io = SLIST_FIRST(&io_list); 162 | if (!io) { 163 | break; 164 | } 165 | SLIST_REMOVE_HEAD(&io_list, completion_link); 166 | TAILQ_REMOVE(&rq->inflight, io, inflight_link); 167 | req_complete(io); 168 | ++rq->metrics.completed; 169 | } 170 | 171 | struct vhd_io *io = TAILQ_FIRST(&rq->inflight); 172 | rq->metrics.oldest_inflight_ts = io ? io->ts : 0; 173 | } 174 | 175 | struct vhd_request_queue *vhd_create_request_queue(void) 176 | { 177 | struct vhd_request_queue *rq = vhd_alloc(sizeof(*rq)); 178 | 179 | rq->evloop = vhd_create_event_loop(VHD_EVENT_LOOP_DEFAULT_MAX_EVENTS); 180 | if (!rq->evloop) { 181 | vhd_free(rq); 182 | return NULL; 183 | } 184 | 185 | TAILQ_INIT(&rq->submission); 186 | TAILQ_INIT(&rq->inflight); 187 | SLIST_INIT(&rq->completion); 188 | rq->completion_bh = vhd_bh_new(rq->evloop, rq_complete_bh, rq); 189 | memset(&rq->metrics, 0, sizeof(rq->metrics)); 190 | return rq; 191 | } 192 | 193 | void vhd_release_request_queue(struct vhd_request_queue *rq) 194 | { 195 | assert(TAILQ_EMPTY(&rq->submission)); 196 | assert(TAILQ_EMPTY(&rq->inflight)); 197 | assert(SLIST_EMPTY(&rq->completion)); 198 | vhd_bh_delete(rq->completion_bh); 199 | vhd_free_event_loop(rq->evloop); 200 | vhd_free(rq); 201 | } 202 | 203 | struct vhd_io_handler *vhd_add_rq_io_handler(struct vhd_request_queue *rq, 204 | int fd, int (*read)(void *opaque), 205 | void *opaque) 206 | { 207 | return vhd_add_io_handler(rq->evloop, fd, read, opaque); 208 | } 209 | 210 | int vhd_run_queue(struct vhd_request_queue *rq) 211 | { 212 | return vhd_run_event_loop(rq->evloop, -1); 213 | } 214 | 215 | void vhd_stop_queue(struct vhd_request_queue *rq) 216 | { 217 | vhd_terminate_event_loop(rq->evloop); 218 | } 219 | 220 | bool vhd_dequeue_request(struct vhd_request_queue *rq, 221 | struct vhd_request *out_req) 222 | { 223 | struct vhd_io *io = TAILQ_FIRST(&rq->submission); 224 | 225 | if (!io) { 226 | return false; 227 | } 228 | 229 | TAILQ_REMOVE(&rq->submission, io, submission_link); 230 | 231 | time_t now = time(NULL); 232 | io->ts = now; 233 | TAILQ_INSERT_TAIL(&rq->inflight, io, inflight_link); 234 | if (!rq->metrics.oldest_inflight_ts) { 235 | rq->metrics.oldest_inflight_ts = now; 236 | } 237 | 238 | out_req->vdev = io->vring->vdev; 239 | out_req->io = io; 240 | 241 | catomic_inc(&rq->metrics.dequeued); 242 | return true; 243 | } 244 | 245 | int vhd_enqueue_request(struct vhd_request_queue *rq, struct vhd_io *io) 246 | { 247 | vhd_vring_inc_in_flight(io->vring); 248 | 249 | TAILQ_INSERT_TAIL(&rq->submission, io, submission_link); 250 | catomic_inc(&rq->metrics.enqueued); 251 | return 0; 252 | } 253 | 254 | void vhd_cancel_queued_requests(struct vhd_request_queue *rq, 255 | const struct vhd_vring 
*vring) 256 | { 257 | struct vhd_io *io = TAILQ_FIRST(&rq->submission); 258 | 259 | while (io) { 260 | struct vhd_io *next = TAILQ_NEXT(io, submission_link); 261 | if (unlikely(io->vring == vring)) { 262 | TAILQ_REMOVE(&rq->submission, io, submission_link); 263 | io->status = VHD_BDEV_CANCELED; 264 | req_complete(io); 265 | catomic_inc(&rq->metrics.cancelled); 266 | } 267 | io = next; 268 | } 269 | } 270 | 271 | /* 272 | * can be called from arbitrary thread; will schedule completion on the rq 273 | * event loop 274 | */ 275 | void vhd_complete_bio(struct vhd_io *io, enum vhd_bdev_io_result status) 276 | { 277 | struct vhd_request_queue *rq; 278 | 279 | io->status = status; 280 | rq = vhd_get_rq_for_vring(io->vring); 281 | 282 | /* 283 | * if this is not the first completion on the list scheduling the bh can be 284 | * skipped because the first one must have done so 285 | */ 286 | if (!SLIST_INSERT_HEAD_ATOMIC(&rq->completion, io, completion_link)) { 287 | vhd_bh_schedule(rq->completion_bh); 288 | } 289 | catomic_inc(&rq->metrics.completions_received); 290 | } 291 | 292 | void vhd_get_rq_stat(struct vhd_request_queue *rq, 293 | struct vhd_rq_metrics *metrics) 294 | { 295 | *metrics = rq->metrics; 296 | } 297 | -------------------------------------------------------------------------------- /memmap.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "queue.h" 7 | #include "memmap.h" 8 | #include "platform.h" 9 | #include "logging.h" 10 | #include "objref.h" 11 | #include "server_internal.h" 12 | 13 | struct vhd_mmap_callbacks { 14 | /* gets called after mapping guest memory region */ 15 | int (*map_cb)(void *addr, size_t len); 16 | /* gets called before unmapping guest memory region */ 17 | int (*unmap_cb)(void *addr, size_t len); 18 | }; 19 | 20 | struct vhd_memory_region { 21 | struct objref ref; 22 | 23 | /* start of the region in guest physical space */ 24 | uint64_t gpa; 25 | /* start of the region in master's virtual space */ 26 | uint64_t uva; 27 | /* start of the region in this process' virtual space */ 28 | void *ptr; 29 | /* region size */ 30 | size_t size; 31 | /* offset of the region from the file base */ 32 | off_t offset; 33 | 34 | /* unique identifiers of this region for caching purposes */ 35 | dev_t device; 36 | ino_t inode; 37 | 38 | /* 39 | * file descriptor this region was created from. Note that this is only 40 | * set to a valid value if the region was created with preserve_fd, -1 41 | * otherwise. 
42 | */ 43 | int fd; 44 | 45 | /* callbacks associated with this memory region */ 46 | struct vhd_mmap_callbacks callbacks; 47 | 48 | LIST_ENTRY(vhd_memory_region) region_link; 49 | }; 50 | 51 | static LIST_HEAD(, vhd_memory_region) g_regions = 52 | LIST_HEAD_INITIALIZER(g_regions); 53 | 54 | size_t platform_page_size; 55 | 56 | static int region_init_id(struct vhd_memory_region *reg, int fd, bool preserve_fd) 57 | { 58 | struct stat stat; 59 | 60 | if (preserve_fd) { 61 | reg->fd = dup(fd); 62 | if (reg->fd < 0) { 63 | int err = errno; 64 | VHD_LOG_ERROR("unable to dup memory region %p-%p fd: %s", 65 | reg->ptr, reg->ptr + reg->size, strerror(err)); 66 | return -err; 67 | } 68 | } else { 69 | reg->fd = -1; 70 | } 71 | 72 | if (fstat(fd, &stat) < 0) { 73 | return 0; 74 | } 75 | 76 | reg->device = stat.st_dev; 77 | reg->inode = stat.st_ino; 78 | 79 | return 0; 80 | } 81 | 82 | /* 83 | * This should be no less than VHOST_USER_MEM_REGIONS_MAX, to accept any 84 | * allowed VHOST_USER_SET_MEM_TABLE message. The master may use more via 85 | * VHOST_USER_ADD_MEM_REG message if VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS 86 | * is negotiated. 87 | */ 88 | #define VHD_RAM_SLOTS_MAX 32 89 | 90 | size_t vhd_memmap_max_memslots(void) 91 | { 92 | return VHD_RAM_SLOTS_MAX; 93 | } 94 | 95 | struct vhd_memory_map { 96 | struct objref ref; 97 | 98 | struct vhd_mmap_callbacks callbacks; 99 | 100 | /* actual number of slots used */ 101 | unsigned num; 102 | struct vhd_memory_region *regions[VHD_RAM_SLOTS_MAX]; 103 | }; 104 | 105 | /* 106 | * Returns actual pointer where uva points to 107 | * or NULL in case of mapping absence 108 | */ 109 | void *uva_to_ptr(struct vhd_memory_map *mm, uint64_t uva) 110 | { 111 | unsigned i; 112 | 113 | for (i = 0; i < mm->num; i++) { 114 | struct vhd_memory_region *reg = mm->regions[i]; 115 | if (uva >= reg->uva && uva - reg->uva < reg->size) { 116 | return reg->ptr + (uva - reg->uva); 117 | } 118 | } 119 | 120 | return NULL; 121 | } 122 | 123 | static void *map_memory(size_t len, int fd, off_t offset) 124 | { 125 | size_t aligned_len, map_len; 126 | void *addr; 127 | 128 | /* 129 | * Some apps map memory in very small chunks, make sure it's at least the 130 | * size of a page so that remap doesn't fail later on. 
131 | */ 132 | len = VHD_ALIGN_UP(len, platform_page_size); 133 | 134 | aligned_len = VHD_ALIGN_PTR_UP(len, HUGE_PAGE_SIZE); 135 | map_len = aligned_len + HUGE_PAGE_SIZE + platform_page_size; 136 | 137 | char *map = mmap(NULL, map_len, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 138 | 0); 139 | if (map == MAP_FAILED) { 140 | VHD_LOG_ERROR("unable to map memory: %s", strerror(errno)); 141 | return MAP_FAILED; 142 | } 143 | 144 | char *aligned_addr = VHD_ALIGN_PTR_UP(map + platform_page_size, HUGE_PAGE_SIZE); 145 | addr = mmap(aligned_addr, len, PROT_READ | PROT_WRITE, 146 | MAP_SHARED | MAP_FIXED, fd, offset); 147 | if (addr == MAP_FAILED) { 148 | VHD_LOG_ERROR("unable to remap memory region %p-%p: %s", aligned_addr, 149 | aligned_addr + len, strerror(errno)); 150 | munmap(map, map_len); 151 | return MAP_FAILED; 152 | } 153 | aligned_addr = addr; 154 | 155 | size_t tail_len = aligned_len - len; 156 | if (tail_len) { 157 | char *tail = aligned_addr + len; 158 | addr = mmap(tail, tail_len, PROT_READ | PROT_WRITE, 159 | MAP_SHARED | MAP_ANONYMOUS | MAP_FIXED, -1, 0); 160 | if (addr == MAP_FAILED) { 161 | VHD_LOG_ERROR("unable to remap memory region %p-%p: %s", tail, 162 | tail + tail_len, strerror(errno)); 163 | munmap(map, map_len); 164 | return MAP_FAILED; 165 | } 166 | } 167 | 168 | char *start = aligned_addr - platform_page_size; 169 | char *end = aligned_addr + aligned_len + platform_page_size; 170 | munmap(map, start - map); 171 | munmap(end, map + map_len - end); 172 | 173 | return aligned_addr; 174 | } 175 | 176 | static int unmap_memory(void *addr, size_t len) 177 | { 178 | size_t map_len = VHD_ALIGN_PTR_UP(len, HUGE_PAGE_SIZE) + platform_page_size * 2; 179 | char *map = addr - platform_page_size; 180 | return munmap(map, map_len); 181 | } 182 | 183 | static int map_region(struct vhd_memory_region *region, uint64_t gpa, 184 | uint64_t uva, size_t size, int fd, off_t offset, 185 | bool preserve_fd) 186 | { 187 | void *ptr; 188 | int ret; 189 | 190 | ptr = map_memory(size, fd, offset); 191 | if (ptr == MAP_FAILED) { 192 | int ret = -errno; 193 | VHD_LOG_ERROR("can't mmap memory: %s", strerror(-ret)); 194 | return ret; 195 | } 196 | 197 | region->ptr = ptr; 198 | region->gpa = gpa; 199 | region->uva = uva; 200 | region->size = size; 201 | region->offset = offset; 202 | 203 | ret = region_init_id(region, fd, preserve_fd); 204 | if (ret < 0) { 205 | munmap(ptr, size); 206 | return ret; 207 | } 208 | 209 | if (region->callbacks.map_cb) { 210 | size_t len = VHD_ALIGN_PTR_UP(size, HUGE_PAGE_SIZE); 211 | ret = region->callbacks.map_cb(ptr, len); 212 | if (ret < 0) { 213 | VHD_LOG_ERROR("map callback failed for region %p-%p: %s", 214 | ptr, ptr + len, strerror(-ret)); 215 | munmap(ptr, size); 216 | return ret; 217 | } 218 | } 219 | 220 | /* Mark memory as defined explicitly */ 221 | VHD_MEMCHECK_DEFINED(ptr, size); 222 | 223 | return 0; 224 | } 225 | 226 | static int unmap_region(struct vhd_memory_region *reg) 227 | { 228 | int ret; 229 | 230 | if (reg->callbacks.unmap_cb) { 231 | size_t len = VHD_ALIGN_PTR_UP(reg->size, HUGE_PAGE_SIZE); 232 | ret = reg->callbacks.unmap_cb(reg->ptr, len); 233 | if (ret < 0) { 234 | VHD_LOG_ERROR("unmap callback failed for region %p-%p: %s", 235 | reg->ptr, reg->ptr + reg->size, strerror(-ret)); 236 | return ret; 237 | } 238 | } 239 | 240 | ret = unmap_memory(reg->ptr, reg->size); 241 | if (ret < 0) { 242 | VHD_LOG_ERROR("failed to unmap region at %p", reg->ptr); 243 | return ret; 244 | } 245 | 246 | return 0; 247 | } 248 | 249 | static void 
region_do_release(struct vhd_memory_region *reg) 250 | { 251 | VHD_ASSERT(vhd_in_ctl_thread()); 252 | 253 | LIST_REMOVE(reg, region_link); 254 | unmap_region(reg); 255 | if (reg->fd >= 0) { 256 | close(reg->fd); 257 | } 258 | vhd_free(reg); 259 | } 260 | 261 | static void reap_regions_bh(void *unused) 262 | { 263 | struct vhd_memory_region *reg, *tmp_reg; 264 | 265 | VHD_ASSERT(vhd_in_ctl_thread()); 266 | 267 | LIST_FOREACH_SAFE(reg, &g_regions, region_link, tmp_reg) { 268 | if (objref_read(®->ref) != 0) { 269 | continue; 270 | } 271 | 272 | region_do_release(reg); 273 | } 274 | } 275 | 276 | static void region_release(struct objref *objref) 277 | { 278 | struct vhd_memory_region *reg = 279 | containerof(objref, struct vhd_memory_region, ref); 280 | 281 | if (vhd_in_ctl_thread()) { 282 | /* 283 | * Only the control thread gets the right to actually delete regions. 284 | * All other threads do it by submitting control work. 285 | */ 286 | region_do_release(reg); 287 | return; 288 | } 289 | 290 | vhd_run_in_ctl(reap_regions_bh, NULL); 291 | } 292 | 293 | static void region_ref(struct vhd_memory_region *reg) 294 | { 295 | objref_get(®->ref); 296 | } 297 | 298 | static void region_unref(struct vhd_memory_region *reg) 299 | { 300 | objref_put(®->ref); 301 | } 302 | 303 | static inline struct vhd_memory_region *region_get_cached( 304 | uint64_t gpa, uint64_t uva, 305 | size_t size, int fd, 306 | off_t offset, 307 | struct vhd_mmap_callbacks *callbacks, 308 | bool preserve_fd 309 | ) 310 | { 311 | struct vhd_memory_region *region; 312 | struct stat stat; 313 | 314 | if (fstat(fd, &stat) < 0) { 315 | return NULL; 316 | } 317 | 318 | LIST_FOREACH(region, &g_regions, region_link) { 319 | if (region->inode != stat.st_ino || region->device != stat.st_dev) { 320 | continue; 321 | } 322 | if (region->gpa != gpa || region->uva != uva || 323 | region->size != size || region->offset != offset) { 324 | continue; 325 | } 326 | if (region->callbacks.map_cb != callbacks->map_cb || 327 | region->callbacks.unmap_cb != callbacks->unmap_cb) { 328 | continue; 329 | } 330 | if (preserve_fd && region->fd == -1) { 331 | continue; 332 | } 333 | 334 | region_ref(region); 335 | return region; 336 | } 337 | 338 | return NULL; 339 | } 340 | 341 | static void memmap_release(struct objref *objref) 342 | { 343 | struct vhd_memory_map *mm = 344 | containerof(objref, struct vhd_memory_map, ref); 345 | unsigned i; 346 | 347 | for (i = 0; i < mm->num; i++) { 348 | region_unref(mm->regions[i]); 349 | } 350 | 351 | vhd_free(mm); 352 | } 353 | 354 | void vhd_memmap_ref(struct vhd_memory_map *mm) __attribute__ ((weak)); 355 | void vhd_memmap_ref(struct vhd_memory_map *mm) 356 | { 357 | objref_get(&mm->ref); 358 | } 359 | 360 | void vhd_memmap_unref(struct vhd_memory_map *mm) __attribute__ ((weak)); 361 | void vhd_memmap_unref(struct vhd_memory_map *mm) 362 | { 363 | objref_put(&mm->ref); 364 | } 365 | 366 | uint64_t ptr_to_gpa(struct vhd_memory_map *mm, void *ptr) 367 | { 368 | unsigned i; 369 | for (i = 0; i < mm->num; ++i) { 370 | struct vhd_memory_region *reg = mm->regions[i]; 371 | if (ptr >= reg->ptr && ptr < reg->ptr + reg->size) { 372 | return (ptr - reg->ptr) + reg->gpa; 373 | } 374 | } 375 | 376 | VHD_LOG_WARN("Failed to translate ptr %p to gpa", ptr); 377 | return TRANSLATION_FAILED; 378 | } 379 | 380 | void *gpa_range_to_ptr(struct vhd_memory_map *mm, 381 | uint64_t gpa, size_t len) __attribute__ ((weak)); 382 | void *gpa_range_to_ptr(struct vhd_memory_map *mm, uint64_t gpa, size_t len) 383 | { 384 | unsigned i; 385 | 386 
| for (i = 0; i < mm->num; i++) { 387 | struct vhd_memory_region *reg = mm->regions[i]; 388 | if (gpa >= reg->gpa && gpa - reg->gpa < reg->size) { 389 | /* 390 | * Check (overflow-safe) that length fits in a single region. 391 | * 392 | * TODO: should we handle gpa areas that cross region boundaries 393 | * but are otherwise valid? 394 | */ 395 | if (len > reg->size || gpa - reg->gpa + len > reg->size) { 396 | return NULL; 397 | } 398 | 399 | return reg->ptr + (gpa - reg->gpa); 400 | } 401 | } 402 | 403 | return NULL; 404 | } 405 | 406 | struct vhd_memory_map *vhd_memmap_new(int (*map_cb)(void *, size_t), 407 | int (*unmap_cb)(void *, size_t)) 408 | { 409 | struct vhd_memory_map *mm = vhd_alloc(sizeof(*mm)); 410 | *mm = (struct vhd_memory_map) { 411 | .callbacks = (struct vhd_mmap_callbacks) { 412 | .map_cb = map_cb, 413 | .unmap_cb = unmap_cb, 414 | } 415 | }; 416 | 417 | objref_init(&mm->ref, memmap_release); 418 | return mm; 419 | } 420 | 421 | struct vhd_memory_map *vhd_memmap_dup(struct vhd_memory_map *mm) 422 | { 423 | size_t i; 424 | struct vhd_memory_map *new_mm = vhd_alloc(sizeof(*mm)); 425 | 426 | new_mm->callbacks = mm->callbacks; 427 | new_mm->num = mm->num; 428 | objref_init(&new_mm->ref, memmap_release); 429 | 430 | for (i = 0; i < mm->num; i++) { 431 | struct vhd_memory_region *reg = mm->regions[i]; 432 | region_ref(reg); 433 | new_mm->regions[i] = reg; 434 | } 435 | 436 | return new_mm; 437 | } 438 | 439 | static int region_create( 440 | uint64_t gpa, uint64_t uva, size_t size, int fd, 441 | off_t offset, struct vhd_mmap_callbacks callbacks, 442 | bool preserve_fd, struct vhd_memory_region **out_region) 443 | { 444 | struct vhd_memory_region *region; 445 | int ret; 446 | 447 | region = vhd_calloc(1, sizeof(*region)); 448 | *region = (struct vhd_memory_region) { 449 | .callbacks = callbacks, 450 | }; 451 | 452 | objref_init(®ion->ref, region_release); 453 | 454 | ret = map_region(region, gpa, uva, size, fd, offset, preserve_fd); 455 | if (ret < 0) { 456 | vhd_free(region); 457 | return ret; 458 | } 459 | 460 | LIST_INSERT_HEAD(&g_regions, region, region_link); 461 | *out_region = region; 462 | return 0; 463 | } 464 | 465 | struct vhd_memory_map *vhd_memmap_dup_remap(struct vhd_memory_map *mm) 466 | { 467 | int ret; 468 | size_t i; 469 | struct vhd_memory_map *new_mm; 470 | 471 | // Verify that the memmap was created with preserve_fd=true 472 | for (i = 0; i < mm->num; i++) { 473 | if (unlikely(mm->regions[i]->fd < 0)) { 474 | VHD_LOG_ERROR("attempting to remap a memory map without preserved" 475 | " fds"); 476 | return NULL; 477 | } 478 | } 479 | 480 | new_mm = vhd_alloc(sizeof(*mm)); 481 | new_mm->callbacks = mm->callbacks; 482 | new_mm->num = mm->num; 483 | objref_init(&new_mm->ref, memmap_release); 484 | 485 | for (i = 0; i < mm->num; i++) { 486 | struct vhd_memory_region *reg = mm->regions[i]; 487 | ret = region_create(reg->gpa, reg->uva, reg->size, 488 | reg->fd, reg->offset, mm->callbacks, 489 | true, &new_mm->regions[i]); 490 | 491 | if (unlikely(ret < 0)) { 492 | while (i-- > 0) { 493 | region_unref(new_mm->regions[i]); 494 | } 495 | vhd_free(new_mm); 496 | return NULL; 497 | } 498 | } 499 | 500 | return new_mm; 501 | } 502 | 503 | int vhd_memmap_add_slot(struct vhd_memory_map *mm, uint64_t gpa, uint64_t uva, 504 | size_t size, int fd, off_t offset, bool preserve_fd) 505 | { 506 | int ret; 507 | unsigned i; 508 | struct vhd_memory_region *region; 509 | 510 | /* check for overflow */ 511 | if (gpa + size < gpa || uva + size < uva) { 512 | return -EINVAL; 513 | } 514 | 
/* check for spare slots */ 515 | if (mm->num == VHD_RAM_SLOTS_MAX) { 516 | return -ENOBUFS; 517 | } 518 | /* check for intersection with existing slots */ 519 | for (i = 0; i < mm->num; i++) { 520 | struct vhd_memory_region *reg = mm->regions[i]; 521 | if (reg->gpa + reg->size <= gpa || gpa + size <= reg->gpa || 522 | reg->uva + reg->size <= uva || uva + size <= reg->uva) { 523 | continue; 524 | } 525 | return -EINVAL; 526 | } 527 | 528 | /* find appropriate position to keep ascending order in gpa */ 529 | for (i = mm->num; i > 0; i--) { 530 | struct vhd_memory_region *reg = mm->regions[i - 1]; 531 | if (reg->gpa < gpa) { 532 | break; 533 | } 534 | } 535 | 536 | region = region_get_cached(gpa, uva, size, fd, offset, &mm->callbacks, 537 | preserve_fd); 538 | if (region == NULL) { 539 | ret = region_create(gpa, uva, size, fd, offset, mm->callbacks, 540 | preserve_fd, ®ion); 541 | if (ret < 0) { 542 | return ret; 543 | } 544 | } else { 545 | VHD_LOG_INFO( 546 | "region %jd-%ju (GPA 0x%016"PRIX64" -> 0x%016"PRIX64") cache hit, " 547 | "reusing (%u refs total)", region->device, region->inode, 548 | region->gpa, region->gpa + region->size, objref_read(®ion->ref) 549 | ); 550 | } 551 | 552 | if (i < mm->num) { 553 | memmove(&mm->regions[i + 1], &mm->regions[i], 554 | sizeof(mm->regions[0]) * (mm->num - i)); 555 | } 556 | mm->regions[i] = region; 557 | mm->num++; 558 | 559 | return 0; 560 | } 561 | 562 | int vhd_memmap_del_slot(struct vhd_memory_map *mm, uint64_t gpa, uint64_t uva, 563 | size_t size) 564 | { 565 | unsigned i; 566 | 567 | for (i = 0; i < mm->num; i++) { 568 | struct vhd_memory_region *reg = mm->regions[i]; 569 | if (reg->gpa == gpa && reg->uva == uva && reg->size == size) { 570 | break; 571 | } 572 | } 573 | 574 | if (i == mm->num) { 575 | return -ENXIO; 576 | } 577 | 578 | region_unref(mm->regions[i]); 579 | 580 | mm->num--; 581 | if (i < mm->num) { 582 | memmove(&mm->regions[i], &mm->regions[i + 1], 583 | sizeof(mm->regions[0]) * (mm->num - i)); 584 | } 585 | 586 | return 0; 587 | } 588 | -------------------------------------------------------------------------------- /event.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Based on QEMU's util/async.c 3 | * 4 | * Copyright (c) 2003-2008 Fabrice Bellard 5 | * Copyright (c) 2009-2017 QEMU contributors 6 | * 7 | * Permission is hereby granted, free of charge, to any person obtaining a copy 8 | * of this software and associated documentation files (the "Software"), to deal 9 | * in the Software without restriction, including without limitation the rights 10 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the Software is 12 | * furnished to do so, subject to the following conditions: 13 | * 14 | * The above copyright notice and this permission notice shall be included in 15 | * all copies or substantial portions of the Software. 16 | * 17 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 20 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 23 | * THE SOFTWARE. 
24 | */ 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | 32 | #include "catomic.h" 33 | #include "queue.h" 34 | #include "platform.h" 35 | #include "event.h" 36 | #include "logging.h" 37 | 38 | enum { 39 | /* Already enqueued and waiting for bh_poll() */ 40 | BH_PENDING = (1 << 0), 41 | 42 | /* Invoke the callback */ 43 | BH_SCHEDULED = (1 << 1), 44 | 45 | /* Delete without invoking callback */ 46 | BH_DELETED = (1 << 2), 47 | 48 | /* Delete after invoking callback */ 49 | BH_ONESHOT = (1 << 3), 50 | }; 51 | 52 | struct vhd_bh { 53 | struct vhd_event_loop *ctx; 54 | vhd_bh_cb *cb; 55 | void *opaque; 56 | SLIST_ENTRY(vhd_bh) next; 57 | unsigned flags; 58 | }; 59 | 60 | typedef SLIST_HEAD(, vhd_bh) vhd_bh_list; 61 | 62 | struct vhd_event_loop { 63 | int epollfd; 64 | 65 | /* eventfd we use to cancel epoll_wait if needed */ 66 | int notifyfd; 67 | 68 | /* number of currently attached events (for consistency checks) */ 69 | uint32_t num_events_attached; 70 | 71 | bool notified; 72 | 73 | /* vhd_terminate_event_loop has been completed */ 74 | bool is_terminated; 75 | 76 | bool has_home_thread; 77 | 78 | /* preallocated events buffer */ 79 | struct epoll_event *events; 80 | uint64_t max_events; 81 | 82 | vhd_bh_list bh_list; 83 | 84 | SLIST_HEAD(, vhd_io_handler) deleted_handlers; 85 | }; 86 | 87 | static void evloop_notify(struct vhd_event_loop *evloop) 88 | { 89 | if (!catomic_xchg(&evloop->notified, true)) { 90 | vhd_set_eventfd(evloop->notifyfd); 91 | } 92 | } 93 | 94 | static void notify_accept(struct vhd_event_loop *evloop) 95 | { 96 | if (catomic_read(&evloop->notified)) { 97 | vhd_clear_eventfd(evloop->notifyfd); 98 | catomic_xchg(&evloop->notified, false); 99 | } 100 | } 101 | 102 | /* called concurrently from any thread */ 103 | static void bh_enqueue(struct vhd_bh *bh, unsigned new_flags) 104 | { 105 | struct vhd_event_loop *ctx = bh->ctx; 106 | unsigned old_flags; 107 | 108 | /* 109 | * The memory barrier implicit in catomic_fetch_or makes sure that: 110 | * 1. any writes needed by the callback are done before the locations are 111 | * read in the bh_poll. 112 | * 2. ctx is loaded before the callback has a chance to execute and bh 113 | * could be freed. 114 | * Paired with bh_dequeue(). 115 | */ 116 | old_flags = catomic_fetch_or(&bh->flags, BH_PENDING | new_flags); 117 | if (!(old_flags & BH_PENDING)) { 118 | SLIST_INSERT_HEAD_ATOMIC(&ctx->bh_list, bh, next); 119 | } 120 | 121 | evloop_notify(ctx); 122 | } 123 | 124 | /* only called from bh_poll() and bh_cleanup() */ 125 | static struct vhd_bh *bh_dequeue(vhd_bh_list *head, unsigned *flags) 126 | { 127 | struct vhd_bh *bh = SLIST_FIRST_RCU(head); 128 | 129 | if (!bh) { 130 | return NULL; 131 | } 132 | 133 | SLIST_REMOVE_HEAD(head, next); 134 | 135 | /* 136 | * The catomic_and is paired with bh_enqueue(). The implicit memory barrier 137 | * ensures that the callback sees all writes done by the scheduling thread. 138 | * It also ensures that the scheduling thread sees the cleared flag before 139 | * bh->cb has run, and thus will call evloop_notify again if necessary. 
140 | */ 141 | *flags = catomic_fetch_and(&bh->flags, ~(BH_PENDING | BH_SCHEDULED)); 142 | return bh; 143 | } 144 | 145 | struct vhd_bh *vhd_bh_new(struct vhd_event_loop *ctx, 146 | vhd_bh_cb *cb, void *opaque) 147 | { 148 | struct vhd_bh *bh = vhd_alloc(sizeof(*bh)); 149 | *bh = (struct vhd_bh){ 150 | .ctx = ctx, 151 | .cb = cb, 152 | .opaque = opaque, 153 | }; 154 | return bh; 155 | } 156 | 157 | void vhd_bh_schedule_oneshot(struct vhd_event_loop *ctx, 158 | vhd_bh_cb *cb, void *opaque) 159 | { 160 | struct vhd_bh *bh = vhd_bh_new(ctx, cb, opaque); 161 | bh_enqueue(bh, BH_SCHEDULED | BH_ONESHOT); 162 | } 163 | 164 | void vhd_bh_schedule(struct vhd_bh *bh) 165 | { 166 | bh_enqueue(bh, BH_SCHEDULED); 167 | } 168 | 169 | /* this is async and doesn't interfere with already running bh */ 170 | void vhd_bh_cancel(struct vhd_bh *bh) 171 | { 172 | catomic_and(&bh->flags, ~BH_SCHEDULED); 173 | } 174 | 175 | /* this is async; deletion only happens in bh_poll, so need to enqueue first */ 176 | void vhd_bh_delete(struct vhd_bh *bh) 177 | { 178 | bh_enqueue(bh, BH_DELETED); 179 | } 180 | 181 | 182 | static void bh_call(struct vhd_bh *bh) 183 | { 184 | bh->cb(bh->opaque); 185 | } 186 | 187 | /* 188 | * Execute bottom halves scheduled so far. Return true if any progress has 189 | * been made (i.e. any bh was executed). 190 | * Multiple occurrences of bh_poll cannot be called concurrently. 191 | */ 192 | static bool bh_poll(struct vhd_event_loop *ctx) 193 | { 194 | vhd_bh_list bh_list; 195 | struct vhd_bh *bh; 196 | unsigned flags; 197 | bool ret = false; 198 | 199 | SLIST_INIT(&bh_list); 200 | /* swap bh list from ctx for a fresh one */ 201 | SLIST_MOVE_ATOMIC(&bh_list, &ctx->bh_list); 202 | 203 | for (;;) { 204 | bh = bh_dequeue(&bh_list, &flags); 205 | if (!bh) { 206 | break; 207 | } 208 | 209 | if ((flags & (BH_SCHEDULED | BH_DELETED)) == BH_SCHEDULED) { 210 | ret = true; 211 | bh_call(bh); 212 | } 213 | 214 | if (flags & (BH_DELETED | BH_ONESHOT)) { 215 | vhd_free(bh); 216 | } 217 | } 218 | 219 | return ret; 220 | } 221 | 222 | static void bh_cleanup(struct vhd_event_loop *ctx) 223 | { 224 | struct vhd_bh *bh; 225 | unsigned flags; 226 | 227 | for (;;) { 228 | bh = bh_dequeue(&ctx->bh_list, &flags); 229 | if (!bh) { 230 | break; 231 | } 232 | 233 | /* only deleted bhs may remain */ 234 | assert(flags & BH_DELETED); 235 | vhd_free(bh); 236 | } 237 | } 238 | 239 | struct vhd_io_handler { 240 | struct vhd_event_loop *evloop; 241 | int (*read)(void *opaque); 242 | /* FIXME: must really include write handler as well */ 243 | void *opaque; 244 | int fd; 245 | 246 | bool attached; 247 | SLIST_ENTRY(vhd_io_handler) deleted_entry; 248 | }; 249 | 250 | static int handle_one_event(struct vhd_io_handler *handler, int event_code) 251 | { 252 | if ((event_code & (EPOLLIN | EPOLLERR | EPOLLRDHUP)) && handler->read) { 253 | return handler->read(handler->opaque); 254 | } 255 | 256 | return 0; 257 | } 258 | 259 | static int handle_events(struct vhd_event_loop *evloop, int nevents) 260 | { 261 | int nerr = 0; 262 | struct epoll_event *events = evloop->events; 263 | 264 | for (int i = 0; i < nevents; i++) { 265 | struct vhd_io_handler *handler = events[i].data.ptr; 266 | /* event loop notifer doesn't use a handler */ 267 | if (!handler) { 268 | continue; 269 | } 270 | /* don't call into detached handler even if it's on the ready list */ 271 | if (!handler->attached) { 272 | continue; 273 | } 274 | if (handle_one_event(handler, events[i].events)) { 275 | nerr++; 276 | } 277 | } 278 | 279 | /* 280 | * The deleted 
handlers are detached and won't appear on the ready list any 281 | * more, so it's now safe to actually delete them. 282 | */ 283 | while (!SLIST_EMPTY(&evloop->deleted_handlers)) { 284 | struct vhd_io_handler *handler = 285 | SLIST_FIRST(&evloop->deleted_handlers); 286 | SLIST_REMOVE_HEAD(&evloop->deleted_handlers, deleted_entry); 287 | vhd_free(handler); 288 | } 289 | 290 | return nerr; 291 | } 292 | 293 | struct vhd_event_loop *vhd_create_event_loop(size_t max_events) 294 | { 295 | int notifyfd; 296 | int epollfd; 297 | 298 | epollfd = epoll_create1(EPOLL_CLOEXEC); 299 | if (epollfd < 0) { 300 | VHD_LOG_ERROR("epoll_create1: %s", strerror(errno)); 301 | return NULL; 302 | } 303 | 304 | notifyfd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); 305 | if (notifyfd < 0) { 306 | VHD_LOG_ERROR("eventfd() failed: %s", strerror(errno)); 307 | goto close_epoll; 308 | } 309 | 310 | /* Register notify eventfd, make sure it is level-triggered */ 311 | struct epoll_event ev = { 312 | .events = EPOLLIN, 313 | }; 314 | if (epoll_ctl(epollfd, EPOLL_CTL_ADD, notifyfd, &ev) == -1) { 315 | VHD_LOG_ERROR("epoll_ctl(EPOLL_CTL_ADD, notifyfd): %s", 316 | strerror(errno)); 317 | goto error_out; 318 | } 319 | 320 | struct vhd_event_loop *evloop = vhd_alloc(sizeof(*evloop)); 321 | max_events++; /* +1 for notify eventfd */ 322 | *evloop = (struct vhd_event_loop) { 323 | .epollfd = epollfd, 324 | .notifyfd = notifyfd, 325 | .max_events = max_events, 326 | .events = vhd_calloc(sizeof(evloop->events[0]), max_events), 327 | }; 328 | SLIST_INIT(&evloop->bh_list); 329 | SLIST_INIT(&evloop->deleted_handlers); 330 | 331 | return evloop; 332 | 333 | error_out: 334 | close(notifyfd); 335 | close_epoll: 336 | close(epollfd); 337 | return NULL; 338 | } 339 | 340 | static __thread struct vhd_event_loop *home_evloop; 341 | 342 | int vhd_run_event_loop(struct vhd_event_loop *evloop, int timeout_ms) 343 | { 344 | if (!home_evloop) { 345 | bool had_home_thread = catomic_xchg(&evloop->has_home_thread, true); 346 | VHD_VERIFY(!had_home_thread); 347 | home_evloop = evloop; 348 | } 349 | VHD_ASSERT(evloop == home_evloop); 350 | 351 | if (evloop->is_terminated) { 352 | return 0; 353 | } 354 | 355 | int nev = epoll_wait(evloop->epollfd, evloop->events, evloop->max_events, 356 | timeout_ms); 357 | if (!nev) { 358 | return -EAGAIN; 359 | } else if (nev < 0) { 360 | int ret = -errno; 361 | if (ret == -EINTR) { 362 | return -EAGAIN; 363 | } 364 | 365 | VHD_LOG_ERROR("epoll_wait internal error: %s", strerror(-ret)); 366 | return ret; 367 | } 368 | 369 | notify_accept(evloop); 370 | bh_poll(evloop); 371 | 372 | int nerr = handle_events(evloop, nev); 373 | if (nerr) { 374 | VHD_LOG_WARN("Got %d events, can't handle %d events", nev, nerr); 375 | return -EIO; 376 | } 377 | 378 | return -EAGAIN; 379 | } 380 | 381 | static void evloop_stop_bh(void *opaque) 382 | { 383 | struct vhd_event_loop *evloop = opaque; 384 | evloop->is_terminated = true; 385 | } 386 | 387 | void vhd_terminate_event_loop(struct vhd_event_loop *evloop) 388 | { 389 | vhd_bh_schedule_oneshot(evloop, evloop_stop_bh, evloop); 390 | } 391 | 392 | /* 393 | * Only free the event loop when there's no concurrent access to it. One way 394 | * to do it is to do free at the end of the thread running the event loop. 395 | * Another is to wait for the thread running the event loop to terminate (to 396 | * join it) and only do free afterwards. 
397 | */ 398 | void vhd_free_event_loop(struct vhd_event_loop *evloop) 399 | { 400 | VHD_ASSERT(evloop->is_terminated); 401 | VHD_ASSERT(evloop->num_events_attached == 0); 402 | bh_cleanup(evloop); 403 | close(evloop->epollfd); 404 | close(evloop->notifyfd); 405 | vhd_free(evloop->events); 406 | vhd_free(evloop); 407 | } 408 | 409 | static void event_loop_inc_events(struct vhd_event_loop *evloop) 410 | { 411 | evloop->num_events_attached++; 412 | } 413 | 414 | static void event_loop_dec_events(struct vhd_event_loop *evloop) 415 | { 416 | VHD_ASSERT(evloop->num_events_attached > 0); 417 | evloop->num_events_attached--; 418 | } 419 | 420 | int vhd_attach_io_handler(struct vhd_io_handler *handler) 421 | { 422 | struct vhd_event_loop *evloop = handler->evloop; 423 | int fd = handler->fd; 424 | 425 | struct epoll_event ev = { 426 | .events = EPOLLIN | EPOLLHUP | EPOLLRDHUP, 427 | .data.ptr = handler 428 | }; 429 | 430 | /* to maintain fields consistency only do this in the home event loop */ 431 | VHD_ASSERT(evloop == home_evloop); 432 | 433 | /* unlike detach, multiple attachment is a logic error */ 434 | VHD_ASSERT(!handler->attached); 435 | 436 | if (epoll_ctl(evloop->epollfd, EPOLL_CTL_ADD, fd, &ev) < 0) { 437 | int ret = -errno; 438 | VHD_LOG_ERROR("Can't add event: %s", strerror(-ret)); 439 | return ret; 440 | } 441 | 442 | handler->attached = true; 443 | 444 | return 0; 445 | } 446 | 447 | struct vhd_io_handler *vhd_add_io_handler(struct vhd_event_loop *evloop, 448 | int fd, int (*read)(void *opaque), 449 | void *opaque) 450 | { 451 | struct vhd_io_handler *handler; 452 | 453 | handler = vhd_alloc(sizeof(*handler)); 454 | *handler = (struct vhd_io_handler) { 455 | .evloop = evloop, 456 | .fd = fd, 457 | .read = read, 458 | .opaque = opaque 459 | }; 460 | 461 | if (vhd_attach_io_handler(handler) < 0) { 462 | goto fail; 463 | } 464 | 465 | event_loop_inc_events(evloop); 466 | return handler; 467 | fail: 468 | vhd_free(handler); 469 | return NULL; 470 | } 471 | 472 | int vhd_detach_io_handler(struct vhd_io_handler *handler) 473 | { 474 | struct vhd_event_loop *evloop = handler->evloop; 475 | 476 | /* to maintain fields consistency only do this in the home event loop */ 477 | VHD_ASSERT(evloop == home_evloop); 478 | 479 | if (!handler->attached) { 480 | return 0; 481 | } 482 | 483 | if (epoll_ctl(evloop->epollfd, EPOLL_CTL_DEL, handler->fd, NULL) < 0) { 484 | int ret = -errno; 485 | VHD_LOG_ERROR("Can't delete event: %s", strerror(-ret)); 486 | return ret; 487 | } 488 | 489 | /* 490 | * The file descriptor being detached may still be sitting on the ready 491 | * list returned by epoll_wait. 492 | * Make sure the handler for it isn't called. 493 | */ 494 | handler->attached = false; 495 | 496 | return 0; 497 | } 498 | 499 | int vhd_del_io_handler(struct vhd_io_handler *handler) 500 | { 501 | int ret; 502 | struct vhd_event_loop *evloop = handler->evloop; 503 | 504 | ret = vhd_detach_io_handler(handler); 505 | if (ret < 0) { 506 | return ret; 507 | } 508 | 509 | /* 510 | * The file descriptor being deleted may still be sitting on the ready list 511 | * returned by epoll_wait. 512 | * Schedule it for deallocation at the end of the iteration after the ready 513 | * event list processing is through. 
514 | */ 515 | SLIST_INSERT_HEAD(&evloop->deleted_handlers, handler, deleted_entry); 516 | 517 | event_loop_dec_events(evloop); 518 | return 0; 519 | } 520 | 521 | void vhd_clear_eventfd(int fd) 522 | { 523 | eventfd_t unused; 524 | while (eventfd_read(fd, &unused) && errno == EINTR) { 525 | ; 526 | } 527 | } 528 | 529 | void vhd_set_eventfd(int fd) 530 | { 531 | while (eventfd_write(fd, 1) && errno == EINTR) { 532 | ; 533 | } 534 | } 535 | 536 | struct vhd_work { 537 | void (*func)(struct vhd_work *, void *); 538 | void *opaque; 539 | int ret; 540 | sem_t wait; 541 | }; 542 | 543 | void vhd_complete_work(struct vhd_work *work, int ret) 544 | { 545 | work->ret = ret; 546 | /* 547 | * sem_post is a full memory barrier so the vhd_submit_work_and_wait will 548 | * see ->ret set above 549 | */ 550 | if (sem_post(&work->wait) < 0) { 551 | /* log an error and continue as there's no better strategy */ 552 | VHD_LOG_ERROR("sem_post: %s", strerror(errno)); 553 | } 554 | } 555 | 556 | static void work_bh(void *opaque) 557 | { 558 | struct vhd_work *work = opaque; 559 | work->func(work, work->opaque); 560 | } 561 | 562 | int vhd_submit_work_and_wait(struct vhd_event_loop *evloop, 563 | void (*func)(struct vhd_work *, void *), 564 | void *opaque) 565 | { 566 | int ret; 567 | struct vhd_work work = { 568 | .func = func, 569 | .opaque = opaque, 570 | }; 571 | 572 | /* waiting for completion in the same event loop would deadlock */ 573 | VHD_ASSERT(evloop != home_evloop); 574 | 575 | /* sem_init can't fail when both arguments are zero */ 576 | ret = sem_init(&work.wait, 0, 0); 577 | VHD_ASSERT(ret == 0); 578 | 579 | vhd_bh_schedule_oneshot(evloop, work_bh, &work); 580 | 581 | /* 582 | * sem_wait may fail with either EINTR (we handle it) or EINVAL when 583 | * called on invalid pointer, which is impossible here. 584 | */ 585 | do { 586 | ret = sem_wait(&work.wait); 587 | } while (ret < 0 && errno == EINTR); 588 | VHD_ASSERT(ret == 0); 589 | 590 | /* 591 | * sem_destroy may fail only with EINVAL when called on invalid pointer, 592 | * which is impossible here. 
593 | */ 594 | ret = sem_destroy(&work.wait); 595 | VHD_ASSERT(ret == 0); 596 | 597 | /* 598 | * sem_wait is a full memory barrier so this is the ->ret set in 599 | * vhd_complete_work 600 | */ 601 | return work.ret; 602 | } 603 | -------------------------------------------------------------------------------- /virtio/virtio_blk.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "vhost/blockdev.h" 5 | 6 | #include "virtio_blk.h" 7 | #include "virtio_blk_spec.h" 8 | 9 | #include "bio.h" 10 | #include "virt_queue.h" 11 | #include "logging.h" 12 | #include "server_internal.h" 13 | #include "vdev.h" 14 | 15 | /* virtio blk data for bdev io */ 16 | struct virtio_blk_io { 17 | struct virtio_virtq *vq; 18 | struct virtio_iov *iov; 19 | 20 | struct vhd_io io; 21 | struct vhd_bdev_io bdev_io; 22 | }; 23 | 24 | static size_t iov_size(const struct vhd_buffer *iov, unsigned niov) 25 | { 26 | size_t len; 27 | unsigned int i; 28 | 29 | len = 0; 30 | for (i = 0; i < niov; i++) { 31 | len += iov[i].len; 32 | } 33 | return len; 34 | } 35 | 36 | static uint8_t translate_status(enum vhd_bdev_io_result status) 37 | { 38 | switch (status) { 39 | case VHD_BDEV_SUCCESS: 40 | return VIRTIO_BLK_S_OK; 41 | default: 42 | return VIRTIO_BLK_S_IOERR; 43 | } 44 | } 45 | 46 | static void set_status(struct virtio_iov *iov, uint8_t status) 47 | { 48 | struct vhd_buffer *last_iov = &iov->iov_in[iov->niov_in - 1]; 49 | *((uint8_t *)last_iov->base) = status; 50 | } 51 | 52 | static void complete_req(struct vhd_vdev *vdev, struct virtio_virtq *vq, 53 | struct virtio_iov *iov, uint8_t status) 54 | { 55 | size_t in_size; 56 | 57 | set_status(iov, status); 58 | /* 59 | * the last byte in the IN buffer is always written (for status), so pass 60 | * the total length of the IN buffer to virtq_push() 61 | */ 62 | in_size = iov_size(iov->iov_in, iov->niov_in); 63 | virtq_push(vq, iov, in_size); 64 | 65 | if (status == VHD_BDEV_SUCCESS && vdev != NULL && 66 | vdev->pte_flush_byte_threshold) { 67 | size_t out_size; 68 | 69 | out_size = iov_size(iov->iov_out, iov->niov_out); 70 | catomic_sub(&vdev->bytes_left_before_pte_flush, in_size + out_size); 71 | } 72 | 73 | virtio_free_iov(iov); 74 | } 75 | 76 | static void complete_io(struct vhd_io *io) 77 | { 78 | struct virtio_blk_io *bio = containerof(io, struct virtio_blk_io, io); 79 | 80 | if (likely(bio->io.status != VHD_BDEV_CANCELED)) { 81 | complete_req(io->vring->vdev, bio->vq, bio->iov, 82 | translate_status(bio->io.status)); 83 | } else { 84 | virtio_free_iov(bio->iov); 85 | } 86 | 87 | vhd_free(bio); 88 | } 89 | 90 | static bool is_valid_block_range_req(uint64_t sector, size_t nsectors, 91 | uint64_t capacity) 92 | { 93 | if (nsectors > capacity || sector > capacity - nsectors) { 94 | VHD_LOG_ERROR("Request (%" PRIu64 "s, +%zus) spans" 95 | " beyond device capacity %" PRIu64, 96 | sector, nsectors, capacity); 97 | return false; 98 | } 99 | 100 | return true; 101 | } 102 | 103 | static bool is_valid_req(uint64_t sector, size_t len, uint64_t capacity) 104 | { 105 | size_t nsectors = len / VIRTIO_BLK_SECTOR_SIZE; 106 | 107 | if (len == 0) { 108 | VHD_LOG_ERROR("Zero size request"); 109 | return false; 110 | } 111 | if (len % VIRTIO_BLK_SECTOR_SIZE) { 112 | VHD_LOG_ERROR("Request length %zu" 113 | " is not a multiple of sector size %u", 114 | len, VIRTIO_BLK_SECTOR_SIZE); 115 | return false; 116 | } 117 | 118 | return is_valid_block_range_req(sector, nsectors, capacity); 119 | } 120 | 121 | static bool 
bio_submit(struct virtio_blk_io *bio) 122 | { 123 | int res = virtio_blk_handle_request(bio->vq, &bio->io); 124 | if (res != 0) { 125 | VHD_LOG_ERROR("bdev request submission failed with %d", res); 126 | vhd_free(bio); 127 | return false; 128 | } 129 | 130 | return true; 131 | } 132 | 133 | static void handle_inout(struct virtio_blk_dev *dev, 134 | struct virtio_blk_req_hdr *req, 135 | struct virtio_virtq *vq, 136 | struct virtio_iov *iov) 137 | { 138 | size_t len; 139 | uint16_t ndatabufs; 140 | struct vhd_buffer *pdata; 141 | enum vhd_bdev_io_type io_type; 142 | 143 | if (req->type == VIRTIO_BLK_T_IN) { 144 | io_type = VHD_BDEV_READ; 145 | pdata = &iov->iov_in[0]; 146 | ndatabufs = iov->niov_in - 1; 147 | } else { 148 | if (virtio_blk_is_readonly(dev)) { 149 | VHD_LOG_ERROR("Write request to readonly device"); 150 | goto fail_request; 151 | } 152 | io_type = VHD_BDEV_WRITE; 153 | pdata = &iov->iov_out[1]; 154 | ndatabufs = iov->niov_out - 1; 155 | } 156 | 157 | len = iov_size(pdata, ndatabufs); 158 | 159 | if (!is_valid_req(req->sector, len, dev->config.capacity)) { 160 | goto fail_request; 161 | } 162 | 163 | struct virtio_blk_io *bio = vhd_zalloc(sizeof(*bio)); 164 | bio->vq = vq; 165 | bio->iov = iov; 166 | bio->io.completion_handler = complete_io; 167 | 168 | bio->bdev_io.type = io_type; 169 | /* 170 | * bdev_io fields must be in VHD_SECTOR_SIZE, but it's equal to 171 | * VIRTIO_SECTOR_SIZE, which is 512 bytes. We assert that below 172 | * in virtio_blk_init_dev() 173 | */ 174 | bio->bdev_io.first_sector = req->sector; 175 | bio->bdev_io.total_sectors = len / VIRTIO_BLK_SECTOR_SIZE; 176 | bio->bdev_io.sglist.nbuffers = ndatabufs; 177 | bio->bdev_io.sglist.buffers = pdata; 178 | 179 | if (!bio_submit(bio)) { 180 | goto fail_request; 181 | } 182 | 183 | /* request will be completed asynchronously */ 184 | return; 185 | 186 | fail_request: 187 | complete_req(NULL, vq, iov, VIRTIO_BLK_S_IOERR); 188 | } 189 | 190 | static void handle_discard_or_write_zeroes(struct virtio_blk_dev *dev, 191 | le32 type, 192 | struct virtio_virtq *vq, 193 | struct virtio_iov *iov) 194 | { 195 | struct virtio_blk_discard_write_zeroes seg; 196 | struct virtio_blk_io *bio; 197 | enum vhd_bdev_io_type io_type; 198 | le32 max_sectors; 199 | bool is_discard = type == VIRTIO_BLK_T_DISCARD; 200 | const char *type_str = is_discard ? "discard" : "write-zeroes"; 201 | VHD_ASSERT(is_discard || type == VIRTIO_BLK_T_WRITE_ZEROES); 202 | 203 | if (virtio_blk_is_readonly(dev)) { 204 | VHD_LOG_ERROR("%s request to readonly device", type_str); 205 | goto fail_request; 206 | } 207 | 208 | /* 209 | * The data used for discard, secure erase or write zeroes commands 210 | * consists of one or more segments. We support only one at the moment. 
211 | */ 212 | if (iov->niov_out != 2) { 213 | VHD_LOG_ERROR("Invalid number of segments for a " 214 | "%s request %"PRIu16, 215 | type_str, iov->niov_out); 216 | goto fail_request; 217 | } 218 | 219 | if (iov->iov_out[1].len != sizeof(seg)) { 220 | VHD_LOG_ERROR("Invalid %s segment size: " 221 | "expected %zu, got %zu!", type_str, 222 | sizeof(seg), iov->iov_out[1].len); 223 | goto fail_request; 224 | } 225 | 226 | memcpy(&seg, iov->iov_out[1].base, sizeof(seg)); 227 | if (!is_valid_block_range_req(seg.sector, seg.num_sectors, 228 | dev->config.capacity)) { 229 | goto fail_request; 230 | } 231 | 232 | if (is_discard) { 233 | le32 alignment = dev->config.discard_sector_alignment; 234 | 235 | if (!VHD_IS_ALIGNED(seg.num_sectors, alignment)) { 236 | VHD_LOG_ERROR("Discard request sector count %"PRIu32 237 | " not aligned to %"PRIu32, 238 | seg.num_sectors, alignment); 239 | goto fail_request; 240 | } 241 | 242 | if (!VHD_IS_ALIGNED(seg.sector, alignment)) { 243 | VHD_LOG_ERROR("Discard request sector %"PRIu64 244 | " not aligned to %"PRIu32, 245 | seg.sector, alignment); 246 | goto fail_request; 247 | } 248 | 249 | io_type = VHD_BDEV_DISCARD; 250 | max_sectors = dev->config.max_discard_sectors; 251 | } else { 252 | io_type = VHD_BDEV_WRITE_ZEROES; 253 | max_sectors = dev->config.max_write_zeroes_sectors; 254 | } 255 | 256 | if (seg.num_sectors > max_sectors) { 257 | VHD_LOG_ERROR("%s request too large: " 258 | "%"PRIu32" (max is %"PRIu32")", 259 | type_str, seg.num_sectors, max_sectors); 260 | goto fail_request; 261 | } 262 | 263 | bio = vhd_zalloc(sizeof(*bio)); 264 | bio->vq = vq; 265 | bio->iov = iov; 266 | bio->io.completion_handler = complete_io; 267 | bio->bdev_io.type = io_type; 268 | /* 269 | * bdev_io fields must be in VHD_SECTOR_SIZE, but it's equal to 270 | * VIRTIO_SECTOR_SIZE, which is 512 bytes. 
We assert that below 271 | * in virtio_blk_init_dev() 272 | */ 273 | bio->bdev_io.first_sector = seg.sector; 274 | bio->bdev_io.total_sectors = seg.num_sectors; 275 | 276 | if (!bio_submit(bio)) { 277 | goto fail_request; 278 | } 279 | 280 | /* request will be completed asynchronously */ 281 | return; 282 | 283 | fail_request: 284 | complete_req(NULL, vq, iov, VIRTIO_BLK_S_IOERR); 285 | } 286 | 287 | static uint8_t handle_getid(struct virtio_blk_dev *dev, 288 | struct virtio_iov *iov) 289 | { 290 | if (iov->niov_in != 2) { 291 | VHD_LOG_ERROR("Bad number of IN segments %u in request", iov->niov_in); 292 | return VIRTIO_BLK_S_IOERR; 293 | } 294 | 295 | struct vhd_buffer *id_buf = &iov->iov_in[0]; 296 | 297 | if (id_buf->len != VIRTIO_BLK_DISKID_LENGTH) { 298 | VHD_LOG_ERROR("Bad id buffer (len %zu)", id_buf->len); 299 | return VIRTIO_BLK_S_IOERR; 300 | } 301 | 302 | /* 303 | * strncpy will not add a null-term if src length is >= desc->len, which is 304 | * what we need 305 | */ 306 | strncpy((char *) id_buf->base, dev->serial, id_buf->len); 307 | 308 | return VIRTIO_BLK_S_OK; 309 | } 310 | 311 | static bool dev_supports_req(struct virtio_blk_dev *dev, le32 type) 312 | { 313 | int feature; 314 | 315 | switch (type) { 316 | case VIRTIO_BLK_T_IN: 317 | case VIRTIO_BLK_T_OUT: 318 | case VIRTIO_BLK_T_GET_ID: 319 | return true; 320 | case VIRTIO_BLK_T_DISCARD: 321 | feature = VIRTIO_BLK_F_DISCARD; 322 | break; 323 | case VIRTIO_BLK_T_WRITE_ZEROES: 324 | feature = VIRTIO_BLK_F_WRITE_ZEROES; 325 | break; 326 | default: 327 | return false; 328 | } 329 | 330 | return virtio_blk_has_feature(dev, feature); 331 | } 332 | 333 | static void handle_buffers(void *arg, struct virtio_virtq *vq, 334 | struct virtio_iov *iov) 335 | { 336 | uint8_t status; 337 | struct virtio_blk_dev *dev = arg; 338 | struct virtio_blk_req_hdr *req; 339 | le32 type; 340 | 341 | /* 342 | * Assume legacy message framing without VIRTIO_F_ANY_LAYOUT: 343 | * - one 16-byte device-readable segment for header 344 | * - data segments 345 | * - one 1-byte device-writable segment for status 346 | * FIXME: get rid of this assumption and support VIRTIO_F_ANY_LAYOUT 347 | */ 348 | 349 | if (!iov->niov_in || iov->iov_in[iov->niov_in - 1].len != 1) { 350 | VHD_LOG_ERROR("No room for status response in the request"); 351 | abort_request(vq, iov); 352 | return; 353 | } 354 | 355 | if (!iov->niov_out || iov->iov_out[0].len != sizeof(*req)) { 356 | VHD_LOG_ERROR("Malformed request header"); 357 | abort_request(vq, iov); 358 | return; 359 | } 360 | 361 | req = iov->iov_out[0].base; 362 | type = req->type; 363 | 364 | if (!dev_supports_req(dev, type)) { 365 | VHD_LOG_WARN("Unknown or unsupported request type %"PRIu32, type); 366 | status = VIRTIO_BLK_S_UNSUPP; 367 | goto out; 368 | } 369 | 370 | switch (type) { 371 | case VIRTIO_BLK_T_IN: 372 | case VIRTIO_BLK_T_OUT: 373 | handle_inout(dev, req, vq, iov); 374 | return; /* async completion */ 375 | case VIRTIO_BLK_T_GET_ID: 376 | status = handle_getid(dev, iov); 377 | break; 378 | case VIRTIO_BLK_T_DISCARD: 379 | case VIRTIO_BLK_T_WRITE_ZEROES: 380 | handle_discard_or_write_zeroes(dev, type, vq, iov); 381 | return; /* async completion */ 382 | default: /* unreachable because of dev_supports_req() */ 383 | VHD_UNREACHABLE(); 384 | }; 385 | 386 | out: 387 | complete_req(NULL, vq, iov, status); 388 | } 389 | 390 | /*////////////////////////////////////////////////////////////////////////////*/ 391 | 392 | int virtio_blk_dispatch_requests(struct virtio_blk_dev *dev, 393 | struct virtio_virtq *vq) 394 | { 
395 | return virtq_dequeue_many(vq, handle_buffers, dev); 396 | } 397 | 398 | __attribute__((weak)) 399 | int virtio_blk_handle_request(struct virtio_virtq *vq, struct vhd_io *io) 400 | { 401 | io->vring = VHD_VRING_FROM_VQ(vq); 402 | return vhd_enqueue_request(vhd_get_rq_for_vring(io->vring), io); 403 | } 404 | 405 | size_t virtio_blk_get_config(struct virtio_blk_dev *dev, void *cfgbuf, 406 | size_t bufsize, size_t offset) 407 | { 408 | if (offset >= sizeof(dev->config)) { 409 | return 0; 410 | } 411 | 412 | size_t data_size = MIN(bufsize, sizeof(dev->config) - offset); 413 | 414 | memcpy(cfgbuf, (char *)(&dev->config) + offset, data_size); 415 | 416 | return data_size; 417 | } 418 | 419 | uint64_t virtio_blk_get_features(struct virtio_blk_dev *dev) 420 | { 421 | return dev->features; 422 | } 423 | 424 | bool virtio_blk_has_feature(struct virtio_blk_dev *dev, int feature) 425 | { 426 | const uint64_t mask = 1ull << feature; 427 | return (virtio_blk_get_features(dev) & mask) == mask; 428 | } 429 | 430 | bool virtio_blk_is_readonly(struct virtio_blk_dev *dev) 431 | { 432 | return virtio_blk_has_feature(dev, VIRTIO_BLK_F_RO); 433 | } 434 | 435 | static void refresh_config_geometry(struct virtio_blk_config *config) 436 | { 437 | /* 438 | * Here we use same max values like we did for blockstor-plugin. 439 | * But it seems that the real world max values are: 440 | */ 441 | /* 63 for sectors */ 442 | const uint8_t max_sectors = 255; 443 | /* 16 for heads */ 444 | const uint8_t max_heads = 255; 445 | /* 16383 for cylinders */ 446 | const uint16_t max_cylinders = 65535; 447 | 448 | config->geometry.sectors = MIN(config->capacity, max_sectors); 449 | config->geometry.heads = 450 | MIN(1 + (config->capacity - 1) / max_sectors, max_heads); 451 | config->geometry.cylinders = 452 | MIN(1 + (config->capacity - 1) / (max_sectors * max_heads), 453 | max_cylinders); 454 | } 455 | 456 | uint64_t virtio_blk_get_total_blocks(struct virtio_blk_dev *dev) 457 | { 458 | return dev->config.capacity >> dev->config.topology.physical_block_exp; 459 | } 460 | 461 | void virtio_blk_set_total_blocks(struct virtio_blk_dev *dev, 462 | uint64_t total_blocks) 463 | { 464 | uint64_t new_capacity = 465 | total_blocks << dev->config.topology.physical_block_exp; 466 | 467 | if (new_capacity > dev->config.capacity) { 468 | VHD_LOG_INFO("virtio-blk resize: %" PRIu64 " -> %" PRIu64, 469 | dev->config.capacity, new_capacity); 470 | } else { 471 | VHD_LOG_WARN("virtio-blk resize not increasing: %" 472 | PRIu64 " -> %" PRIu64, 473 | dev->config.capacity, new_capacity); 474 | } 475 | 476 | dev->config.capacity = new_capacity; 477 | refresh_config_geometry(&dev->config); 478 | } 479 | 480 | void virtio_blk_init_dev( 481 | struct virtio_blk_dev *dev, 482 | const struct vhd_bdev_info *bdev) 483 | { 484 | uint32_t sector_size; 485 | 486 | dev->serial = vhd_strdup(bdev->serial); 487 | 488 | dev->features = VIRTIO_BLK_DEFAULT_FEATURES; 489 | if (vhd_blockdev_is_readonly(bdev)) { 490 | dev->features |= (1ull << VIRTIO_BLK_F_RO); 491 | } 492 | if (vhd_blockdev_has_discard(bdev)) { 493 | dev->features |= (1ull << VIRTIO_BLK_F_DISCARD); 494 | } 495 | if (vhd_blockdev_has_write_zeroes(bdev)) { 496 | dev->features |= (1ull << VIRTIO_BLK_F_WRITE_ZEROES); 497 | } 498 | 499 | /* 500 | * Both virtio and block backend use the same sector size of 512. Don't 501 | * bother converting between the two, just assert they are the same. 
502 | */ 503 | VHD_STATIC_ASSERT(VHD_SECTOR_SIZE == VIRTIO_BLK_SECTOR_SIZE); 504 | 505 | /* capacity in 512 byte virtio sectors */ 506 | dev->config.capacity = 507 | (bdev->total_blocks * bdev->block_size) / VIRTIO_BLK_SECTOR_SIZE; 508 | 509 | sector_size = vhd_blockdev_sector_size(bdev); 510 | 511 | /* blk_size in bytes, aka "logical-block" */ 512 | dev->config.blk_size = sector_size; 513 | dev->config.numqueues = bdev->num_queues; 514 | 515 | /* # of logical blocks per physical block (log2) */ 516 | dev->config.topology.physical_block_exp = 517 | vhd_find_first_bit32(bdev->block_size / sector_size); 518 | 519 | dev->config.topology.alignment_offset = 0; 520 | /* TODO: can get that from bdev info */ 521 | dev->config.topology.min_io_size = 1; 522 | 523 | /* opt_io_size in blk_size chunks (logical blocks) */ 524 | dev->config.topology.opt_io_size = 525 | bdev->optimal_io_size / sector_size; 526 | 527 | /* discard_sector_alignment in 512-bytes virtio sectors */ 528 | dev->config.discard_sector_alignment = 529 | sector_size / VIRTIO_BLK_SECTOR_SIZE; 530 | 531 | dev->config.max_discard_sectors = VIRTIO_BLK_MAX_DISCARD_SECTORS; 532 | dev->config.max_discard_seg = VIRTIO_BLK_MAX_DISCARD_SEGMENTS; 533 | 534 | dev->config.max_write_zeroes_sectors = VIRTIO_BLK_MAX_WRITE_ZEROES_SECTORS; 535 | dev->config.max_write_zeroes_seg = VIRTIO_BLK_MAX_WRITE_ZEROES_SEGMENTS; 536 | /* 537 | * Since we don't know anything about the user of the library beforehand 538 | * assume we _may_ unmap the sectors on write-zeroes. 539 | * TODO: maybe propagate this value from blockdev config at creation time? 540 | */ 541 | dev->config.write_zeroes_may_unmap = 1; 542 | 543 | /* 544 | * Hardcode seg_max to 126. The same way like it's done for virtio-blk in 545 | * qemu 2.12 which is used by blockstor-plugin. 546 | * Although, this is an error prone approch which leads to the problems 547 | * when queue size != 128 548 | * (see https://www.mail-archive.com/qemu-devel@nongnu.org/msg668144.html) 549 | * we have to use it to provide migration compatibility between virtio-blk 550 | * and vhost-user-blk in both directions. 
551 | */ 552 | dev->config.seg_max = 128 - 2; 553 | 554 | refresh_config_geometry(&dev->config); 555 | } 556 | 557 | void virtio_blk_destroy_dev(struct virtio_blk_dev *dev) 558 | { 559 | vhd_free(dev->serial); 560 | dev->serial = NULL; 561 | } 562 | 563 | struct vhd_bdev_io *vhd_get_bdev_io(struct vhd_io *io) 564 | { 565 | struct virtio_blk_io *bio = containerof(io, struct virtio_blk_io, io); 566 | return &bio->bdev_io; 567 | } 568 | -------------------------------------------------------------------------------- /virtio/virt_queue.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "catomic.h" 10 | #include "virt_queue.h" 11 | #include "logging.h" 12 | #include "memmap.h" 13 | #include "memlog.h" 14 | 15 | /** 16 | * Holds private virtq data together with iovs we show users 17 | */ 18 | struct virtq_iov_private { 19 | /* Private virtq fields */ 20 | uint16_t used_head; 21 | struct vhd_memory_map *mm; 22 | 23 | /* Iov we show to caller */ 24 | struct virtio_iov iov; 25 | }; 26 | 27 | static inline uint16_t virtq_get_used_event(struct virtio_virtq *vq) 28 | { 29 | return vq->avail->ring[vq->qsz]; 30 | } 31 | 32 | static inline void virtq_set_avail_event(struct virtio_virtq *vq, 33 | uint16_t avail_idx) 34 | { 35 | *(le16 *)&vq->used->ring[vq->qsz] = avail_idx; 36 | } 37 | 38 | static int virtq_dequeue_one(struct virtio_virtq *vq, uint16_t head, 39 | virtq_handle_buffers_cb handle_buffers_cb, 40 | void *arg, bool resubmit); 41 | 42 | static struct virtq_iov_private *clone_iov(struct virtio_virtq *vq) 43 | { 44 | struct virtq_iov_private *priv; 45 | uint16_t niov = vq->niov_out + vq->niov_in; 46 | size_t iov_size = sizeof(struct vhd_buffer) * niov; 47 | 48 | priv = vhd_alloc(sizeof(*priv) + iov_size); 49 | memcpy(priv->iov.buffers, vq->buffers, iov_size); 50 | priv->iov.niov_out = vq->niov_out; 51 | priv->iov.iov_out = &priv->iov.buffers[0]; 52 | priv->iov.niov_in = vq->niov_in; 53 | priv->iov.iov_in = &priv->iov.buffers[vq->niov_out]; 54 | return priv; 55 | } 56 | 57 | void virtio_free_iov(struct virtio_iov *iov) 58 | { 59 | struct virtq_iov_private *priv = 60 | containerof(iov, struct virtq_iov_private, iov); 61 | 62 | /* matched with ref in virtq_dequeue_one */ 63 | vhd_memmap_unref(priv->mm); 64 | vhd_free(priv); 65 | } 66 | 67 | uint16_t virtio_iov_get_head(struct virtio_iov *iov) 68 | { 69 | struct virtq_iov_private *priv = 70 | containerof(iov, struct virtq_iov_private, iov); 71 | return priv->used_head; 72 | } 73 | 74 | static int add_buffer(struct virtio_virtq *vq, void *addr, size_t len, bool in) 75 | { 76 | uint16_t niov = vq->niov_out + vq->niov_in; 77 | 78 | if (niov >= vq->max_chain_len) { 79 | VHD_OBJ_ERROR(vq, "descriptor chain exceeds max length %u", 80 | vq->max_chain_len); 81 | return -ENOBUFS; 82 | } 83 | 84 | if (in) { 85 | vq->niov_in++; 86 | } else { 87 | /* 88 | * 2.6.4.2 Driver Requirements: Message Framing The driver MUST place 89 | * any device-writable descriptor elements after any device-readable 90 | * descriptor elements. 
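* For illustration, with virtio-blk as an example: a well-formed request chain is laid out as [header: device-readable][data buffers][status byte: device-writable], so once a device-writable element has been seen, any further device-readable element means the framing is broken, which is what the check below rejects.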
91 | */ 92 | if (vq->niov_in) { 93 | VHD_LOG_ERROR("Device-readable buffer after device-writable"); 94 | return -EINVAL; 95 | } 96 | vq->niov_out++; 97 | } 98 | 99 | vq->buffers[niov] = (struct vhd_buffer) { 100 | .base = addr, 101 | .len = len, 102 | .write_only = in, 103 | }; 104 | 105 | return 0; 106 | } 107 | 108 | static int map_buffer(struct virtio_virtq *vq, uint64_t gpa, size_t len, 109 | bool write_only) 110 | { 111 | void *addr = gpa_range_to_ptr(vq->mm, gpa, len); 112 | if (!addr) { 113 | VHD_OBJ_ERROR(vq, "Failed to map GPA 0x%" PRIx64 ", +0x%zx", gpa, len); 114 | return -EFAULT; 115 | } 116 | 117 | return add_buffer(vq, addr, len, write_only); 118 | } 119 | 120 | /* Modify the inflight descriptor after dequeuing a request from the available ring. */ 121 | static void virtq_inflight_avail_update(struct virtio_virtq *vq, uint16_t head) 122 | { 123 | if (!vq->inflight_region) { 124 | return; 125 | } 126 | 127 | if (vq->inflight_region->desc[head].inflight) { 128 | VHD_OBJ_WARN(vq, "inflight[%u]=%u (expected 0)", head, 129 | vq->inflight_region->desc[head].inflight); 130 | } 131 | 132 | vq->inflight_region->desc[head].counter = vq->req_cnt; 133 | /* 134 | * Ensure the inflight region fields are updated in the expected order, so 135 | * that the next incarnation of the vhost backend can recover the state 136 | * regardless of where the current one dies. There's no concurrent access 137 | * to the inflight region so only a compiler barrier is necessary. 138 | */ 139 | barrier(); 140 | vq->inflight_region->desc[head].inflight = 1; 141 | vq->req_cnt++; 142 | } 143 | 144 | /* Prepare the inflight descriptor for commit. */ 145 | static void virtq_inflight_used_update(struct virtio_virtq *vq, uint16_t head) 146 | { 147 | if (!vq->inflight_region) { 148 | return; 149 | } 150 | 151 | vq->inflight_region->desc[head].next = vq->inflight_region->last_batch_head; 152 | /* 153 | * Ensure the inflight region fields are updated in the expected order, so 154 | * that the next incarnation of the vhost backend can recover the state 155 | * regardless of where the current one dies. There's no concurrent access 156 | * to the inflight region so only a compiler barrier is necessary. 157 | */ 158 | barrier(); 159 | vq->inflight_region->last_batch_head = head; 160 | } 161 | 162 | /* Post-commit inflight descriptor handling. */ 163 | static void virtq_inflight_used_commit(struct virtio_virtq *vq, uint16_t head) 164 | { 165 | if (!vq->inflight_region) { 166 | return; 167 | } 168 | 169 | if (vq->inflight_region->desc[head].inflight != 1) { 170 | VHD_OBJ_WARN(vq, "inflight[%u]=%u (expected 1)", head, 171 | vq->inflight_region->desc[head].inflight); 172 | } 173 | 174 | vq->inflight_region->desc[head].inflight = 0; 175 | /* 176 | * Make sure used_idx is stored after the desc content, so that the next 177 | * incarnation of the vhost backend sees consistent values regardless of 178 | * where the current one dies. There's no concurrent access to the 179 | * inflight region so only a compiler barrier is necessary. 180 | */ 181 | barrier(); 182 | vq->inflight_region->used_idx = vq->used->idx; 183 | } 184 | 185 | /* 186 | * If the value of ``used_idx`` does not match the ``idx`` value of the 187 | * used ring, the inflight field of the ``inflight_split_desc`` entries 188 | * in the last batch may be incorrect and is fixed up here on reconnect.
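* A worked example with hypothetical values: if after a crash used->idx is 42 while the inflight region records used_idx 41, batch_size below is 1, so the descriptor recorded in last_batch_head has in fact completed and its inflight flag is cleared before resubmission; req_cnt is re-seeded from the largest per-descriptor counter seen, so new requests continue with increasing counters.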
189 | */ 190 | static void virtq_inflight_reconnect_update(struct virtio_virtq *vq) 191 | { 192 | uint16_t batch_size; 193 | uint16_t idx; 194 | 195 | vq->req_cnt = 0; 196 | if (!vq->inflight_region) { 197 | return; 198 | } 199 | 200 | /* Initialize the global req counter for the inflight descriptors. */ 201 | for (idx = 0; idx < vq->inflight_region->desc_num; idx++) { 202 | if (vq->inflight_region->desc[idx].counter > vq->req_cnt) { 203 | vq->req_cnt = vq->inflight_region->desc[idx].counter; 204 | } 205 | } 206 | 207 | /* fresh inflight region (not a reconnect) */ 208 | if (!vq->req_cnt) { 209 | goto out; 210 | } 211 | 212 | batch_size = vq->used->idx - vq->inflight_region->used_idx; 213 | if (!batch_size) { 214 | /* Last batch was sent successfully. Nothing to update. */ 215 | goto out; 216 | } 217 | 218 | /* we don't do batching for now */ 219 | VHD_ASSERT(batch_size == 1); 220 | 221 | idx = vq->inflight_region->last_batch_head; 222 | while (batch_size) { 223 | vq->inflight_region->desc[idx].inflight = 0; 224 | idx = vq->inflight_region->desc[idx].next; 225 | batch_size--; 226 | } 227 | 228 | out: 229 | vq->req_cnt++; 230 | vq->inflight_region->used_idx = vq->used->idx; 231 | } 232 | 233 | static void virtio_virtq_reset_stat(struct virtio_virtq *vq) 234 | { 235 | memset(&vq->stat, 0, sizeof(vq->stat)); 236 | } 237 | 238 | /* 239 | * Windows drivers violate the spec and create descriptor chains up to this 240 | * long, regardless of the queue size. 241 | */ 242 | #define WINDOWS_CHAIN_LEN_MAX (512 + 3) 243 | 244 | void virtio_virtq_init(struct virtio_virtq *vq) 245 | { 246 | VHD_ASSERT(!vq->buffers); 247 | 248 | vq->max_chain_len = MAX(vq->qsz, WINDOWS_CHAIN_LEN_MAX); 249 | 250 | vq->buffers = vhd_calloc(vq->max_chain_len, sizeof(vq->buffers[0])); 251 | 252 | /* Make check on the first virtq dequeue. */ 253 | vq->inflight_check = true; 254 | virtq_inflight_reconnect_update(vq); 255 | 256 | virtio_virtq_reset_stat(vq); 257 | } 258 | 259 | void virtio_virtq_release(struct virtio_virtq *vq) 260 | { 261 | VHD_ASSERT(vq->buffers); 262 | vhd_free(vq->buffers); 263 | *vq = (struct virtio_virtq) {}; 264 | } 265 | 266 | struct inflight_resubmit { 267 | uint64_t counter; 268 | uint16_t head; 269 | }; 270 | 271 | static int inflight_resubmit_compare(const void *first, const void *second) 272 | { 273 | struct inflight_resubmit *left = (struct inflight_resubmit *)first; 274 | struct inflight_resubmit *right = (struct inflight_resubmit *)second; 275 | 276 | if (left->counter < right->counter) { 277 | return -1; 278 | } 279 | /* Can't return 0, since counter values are always different. */ 280 | 281 | return 1; 282 | } 283 | 284 | /* Resubmit inflight requests on the virtqueue start. 
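Illustration with hypothetical values: if descriptors 3 and 7 are still marked inflight with counters 12 and 9 respectively, the sort below resubmits head 7 first and then head 3, i.e. in their original submission order.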
*/ 285 | static int virtq_inflight_resubmit(struct virtio_virtq *vq, 286 | virtq_handle_buffers_cb handle_buffers_cb, 287 | void *arg) 288 | { 289 | uint16_t desc_num; 290 | uint16_t cnt; 291 | struct inflight_resubmit *resubmit_array; 292 | uint16_t i; 293 | int res; 294 | 295 | if (!vq->inflight_region) { 296 | return 0; 297 | } 298 | 299 | desc_num = vq->inflight_region->desc_num; 300 | cnt = 0; 301 | resubmit_array = alloca(sizeof(*resubmit_array) * desc_num); 302 | for (i = 0; i < desc_num; i++) { 303 | if (vq->inflight_region->desc[i].inflight) { 304 | resubmit_array[cnt].counter = vq->inflight_region->desc[i].counter; 305 | resubmit_array[cnt].head = i; 306 | cnt++; 307 | } 308 | } 309 | qsort(resubmit_array, cnt, sizeof(*resubmit_array), 310 | inflight_resubmit_compare); 311 | 312 | res = 0; 313 | VHD_OBJ_DEBUG(vq, "cnt = %d inflight requests should be resubmitted", cnt); 314 | for (i = 0; i < cnt; i++) { 315 | uint16_t head = resubmit_array[i].head; 316 | if (head >= vq->qsz) { 317 | VHD_OBJ_ERROR(vq, "resubmit desc %u: head %u past queue size %u", 318 | i, head, vq->qsz); 319 | return -ERANGE; 320 | } 321 | 322 | res = virtq_dequeue_one(vq, head, handle_buffers_cb, arg, true); 323 | if (res) { 324 | break; 325 | } 326 | } 327 | 328 | return res; 329 | } 330 | 331 | bool virtq_is_broken(struct virtio_virtq *vq) 332 | { 333 | return vq->broken; 334 | } 335 | 336 | void mark_broken(struct virtio_virtq *vq) 337 | { 338 | vq->broken = true; 339 | } 340 | 341 | #define DESCRIPTOR_ERROR(vq, idx, desc, fmt, ...) \ 342 | VHD_OBJ_ERROR(vq, "[%u]{0x%" PRIx64 ", +0x%x, 0x%x, %u}: " fmt, \ 343 | (idx), (desc)->addr, (desc)->len, \ 344 | (desc)->flags, (desc)->next, ##__VA_ARGS__) 345 | 346 | static int walk_indirect_table(struct virtio_virtq *vq, 347 | const struct virtq_desc *table_desc) 348 | { 349 | int res; 350 | struct virtq_desc desc; 351 | struct virtq_desc *desc_table; 352 | uint16_t table_len = table_desc->len / sizeof(desc); 353 | uint16_t idx; 354 | 355 | if (table_desc->len == 0 || table_desc->len % sizeof(desc)) { 356 | VHD_OBJ_ERROR(vq, "Bad indirect descriptor table length %u", 357 | table_desc->len); 358 | return -EINVAL; 359 | } 360 | 361 | desc_table = gpa_range_to_ptr(vq->mm, table_desc->addr, table_desc->len); 362 | if (!desc_table) { 363 | VHD_OBJ_ERROR(vq, "Failed to map indirect descriptor table " 364 | "GPA 0x%" PRIx64 ", +0x%x", 365 | table_desc->addr, table_desc->len); 366 | return -EFAULT; 367 | } 368 | 369 | for (idx = 0; ; idx = desc.next) { 370 | desc = desc_table[idx]; 371 | 372 | if (desc.flags & VIRTQ_DESC_F_INDIRECT) { 373 | DESCRIPTOR_ERROR(vq, idx, &desc, "nested indirect descriptor"); 374 | return -EMLINK; 375 | } 376 | 377 | res = map_buffer(vq, desc.addr, desc.len, 378 | desc.flags & VIRTQ_DESC_F_WRITE); 379 | if (res != 0) { 380 | DESCRIPTOR_ERROR(vq, idx, &desc, 381 | "failed to map descriptor in indirect table"); 382 | return res; 383 | } 384 | 385 | if (!(desc.flags & VIRTQ_DESC_F_NEXT)) { 386 | break; 387 | } 388 | 389 | if (desc.next >= table_len) { 390 | DESCRIPTOR_ERROR(vq, idx, &desc, 391 | "next points past indirect table size %u", 392 | table_len); 393 | return -ERANGE; 394 | } 395 | } 396 | 397 | return 0; 398 | } 399 | 400 | /* 401 | * Traverse a descriptor chain starting at @head, mapping the descriptors found 402 | * and pushing them onto @vq->buffers. 403 | * Return the number of descriptors consumed, or -errno. 
404 | */ 405 | static int walk_chain(struct virtio_virtq *vq, uint16_t head) 406 | { 407 | uint16_t idx; 408 | uint16_t chain_len; 409 | struct virtq_desc desc; 410 | int res; 411 | 412 | vq->niov_out = vq->niov_in = 0; 413 | 414 | for (idx = head, chain_len = 1; ; idx = desc.next, chain_len++) { 415 | desc = vq->desc[idx]; 416 | 417 | if (desc.flags & VIRTQ_DESC_F_INDIRECT) { 418 | if (desc.flags & VIRTQ_DESC_F_NEXT) { 419 | DESCRIPTOR_ERROR(vq, idx, &desc, 420 | "indirect descriptor must have no next"); 421 | return -EINVAL; 422 | } 423 | 424 | res = walk_indirect_table(vq, &desc); 425 | if (res != 0) { 426 | DESCRIPTOR_ERROR(vq, idx, &desc, 427 | "failed to walk indirect descriptor table"); 428 | return res; 429 | } 430 | 431 | break; 432 | } 433 | 434 | res = map_buffer(vq, desc.addr, desc.len, 435 | desc.flags & VIRTQ_DESC_F_WRITE); 436 | if (res != 0) { 437 | DESCRIPTOR_ERROR(vq, idx, &desc, "failed to map"); 438 | return res; 439 | } 440 | 441 | if (!(desc.flags & VIRTQ_DESC_F_NEXT)) { 442 | break; 443 | } 444 | 445 | if (desc.next >= vq->qsz) { 446 | DESCRIPTOR_ERROR(vq, idx, &desc, 447 | "next points past queue size %u", vq->qsz); 448 | return -ERANGE; 449 | } 450 | } 451 | 452 | return chain_len; 453 | } 454 | 455 | int virtq_dequeue_many(struct virtio_virtq *vq, 456 | virtq_handle_buffers_cb handle_buffers_cb, 457 | void *arg) 458 | { 459 | int res; 460 | uint16_t i; 461 | uint16_t num_avail; 462 | uint16_t avail, avail2; 463 | time_t now; 464 | 465 | if (virtq_is_broken(vq)) { 466 | VHD_OBJ_ERROR(vq, "virtqueue is broken, cannot process"); 467 | return -ENODEV; 468 | } 469 | 470 | if (vq->inflight_check) { 471 | /* Check for the inflight requests once at the start. */ 472 | VHD_OBJ_DEBUG(vq, "resubmit inflight requests, if any"); 473 | res = virtq_inflight_resubmit(vq, handle_buffers_cb, arg); 474 | if (res) { 475 | goto queue_broken; 476 | } 477 | vq->inflight_check = false; 478 | } 479 | 480 | now = time(NULL); 481 | 482 | if (now - vq->stat.period_start_ts > 60) { 483 | vq->stat.period_start_ts = now; 484 | vq->stat.metrics.queue_len_max_60s = 0; 485 | } 486 | 487 | vq->stat.metrics.dispatch_total++; 488 | 489 | avail = vq->avail->idx; 490 | if (vq->has_event_idx) { 491 | smp_mb(); /* avail->idx read followed by avail_event write */ 492 | while (true) { 493 | virtq_set_avail_event(vq, avail); 494 | smp_mb(); /* avail_event write followed by avail->idx read */ 495 | avail2 = vq->avail->idx; 496 | if (avail2 == avail) { 497 | break; 498 | } 499 | smp_mb(); /* avail->idx read followed by avail_event write */ 500 | avail = avail2; 501 | } 502 | } 503 | 504 | num_avail = avail - vq->last_avail; 505 | if (num_avail > vq->qsz) { 506 | VHD_OBJ_ERROR(vq, "num_avail %u (%u - %u) exceeds queue size %u", 507 | num_avail, avail, vq->last_avail, vq->qsz); 508 | return -EOVERFLOW; 509 | } 510 | 511 | if (!num_avail) { 512 | vq->stat.metrics.dispatch_empty++; 513 | return 0; 514 | } 515 | 516 | vq->stat.metrics.queue_len_last = num_avail; 517 | if (vq->stat.metrics.queue_len_last > vq->stat.metrics.queue_len_max_60s) { 518 | vq->stat.metrics.queue_len_max_60s = vq->stat.metrics.queue_len_last; 519 | } 520 | 521 | /* Make sure that further desc reads do not pass avail->idx read. 
*/ 522 | smp_rmb(); /* barrier pair [A] */ 523 | 524 | /* TODO: disable extra notifies from this point */ 525 | 526 | for (i = 0; i < num_avail; ++i) { 527 | /* Grab next descriptor head */ 528 | uint16_t head = vq->avail->ring[vq->last_avail % vq->qsz]; 529 | if (head >= vq->qsz) { 530 | VHD_OBJ_ERROR(vq, "avail %u: head %u past queue size %u", 531 | vq->last_avail, head, vq->qsz); 532 | return -ERANGE; 533 | } 534 | 535 | res = virtq_dequeue_one(vq, head, handle_buffers_cb, arg, false); 536 | if (res) { 537 | goto queue_broken; 538 | } 539 | 540 | vq->stat.metrics.request_total++; 541 | } 542 | 543 | /* TODO: restore notifier mask here */ 544 | return 0; 545 | 546 | queue_broken: 547 | mark_broken(vq); 548 | return res; 549 | } 550 | 551 | static int virtq_dequeue_one(struct virtio_virtq *vq, uint16_t head, 552 | virtq_handle_buffers_cb handle_buffers_cb, 553 | void *arg, bool resubmit) 554 | { 555 | int ret; 556 | 557 | ret = walk_chain(vq, head); 558 | if (ret < 0) { 559 | return ret; 560 | } 561 | 562 | /* Create iov copy from stored buffer for client handling */ 563 | struct virtq_iov_private *priv = clone_iov(vq); 564 | priv->used_head = head; 565 | priv->mm = vq->mm; 566 | /* matched with unref in virtio_free_iov */ 567 | vhd_memmap_ref(priv->mm); 568 | 569 | if (!resubmit) { 570 | virtq_inflight_avail_update(vq, head); 571 | } 572 | 573 | /* Send this over to handler */ 574 | handle_buffers_cb(arg, vq, &priv->iov); 575 | 576 | vq->last_avail++; 577 | 578 | return 0; 579 | } 580 | 581 | static void vhd_log_buffers(struct vhd_memory_log *log, 582 | struct vhd_memory_map *mm, 583 | struct virtio_iov *viov) 584 | { 585 | uint16_t i; 586 | for (i = 0; i < viov->niov_in; ++i) { 587 | struct vhd_buffer *iov = &viov->iov_in[i]; 588 | vhd_mark_range_dirty(log, mm, iov->base, iov->len); 589 | } 590 | } 591 | 592 | /* 593 | * NOTE: this @mm is the one the request was started with, not the current one 594 | * on @vq 595 | */ 596 | static void vhd_log_modified(struct virtio_virtq *vq, 597 | struct vhd_memory_map *mm, 598 | struct virtio_iov *iov, 599 | uint16_t used_idx) 600 | { 601 | /* log modifications of buffers in descr */ 602 | vhd_log_buffers(vq->log, mm, iov); 603 | if (vq->flags & VHOST_VRING_F_LOG) { 604 | /* log modification of used->idx */ 605 | vhd_mark_gpa_range_dirty(vq->log, 606 | vq->used_gpa_base + 607 | offsetof(struct virtq_used, idx), 608 | sizeof(vq->used->idx)); 609 | /* log modification of used->ring[idx] */ 610 | vhd_mark_gpa_range_dirty(vq->log, 611 | vq->used_gpa_base + 612 | offsetof(struct virtq_used, ring[used_idx]), 613 | sizeof(vq->used->ring[0])); 614 | } 615 | } 616 | 617 | static void virtq_do_notify(struct virtio_virtq *vq) 618 | { 619 | if (vq->notify_fd != -1) { 620 | eventfd_write(vq->notify_fd, 1); 621 | } 622 | } 623 | 624 | static bool virtq_need_notify(struct virtio_virtq *vq) 625 | { 626 | if (!vq->has_event_idx) { 627 | /* 628 | * Virtio specification v1.0, 5.1.6.2.3: 629 | * Often a driver will suppress transmission interrupts using the 630 | * VIRTQ_AVAIL_F_NO_INTERRUPT flag (see 3.2.2 Receiving Used Buffers 631 | * From The Device) and check for used packets in the transmit path 632 | * of following packets. 633 | */ 634 | return !(vq->avail->flags & VIRTQ_AVAIL_F_NO_INTERRUPT); 635 | } 636 | 637 | /* 638 | * Virtio specification v1.0, 2.4.7.2: 639 | * if the VIRTIO_F_EVENT_IDX feature bit is negotiated: 640 | * The device MUST ignore the lower bit of flags. 
641 | * If the idx field in the used ring was 642 | * equal to used_event, the device MUST send an interrupt. 643 | * -------------------------------------------------------- 644 | * Note: the code below assumes that virtq_notify is always called 645 | * once per completion, and never per batch. 646 | */ 647 | return virtq_get_used_event(vq) == (uint16_t)(vq->used->idx - 1); 648 | } 649 | 650 | static void virtq_notify(struct virtio_virtq *vq) 651 | { 652 | /* expose used ring entries before checking used event */ 653 | smp_mb(); 654 | 655 | if (virtq_need_notify(vq)) { 656 | virtq_do_notify(vq); 657 | } 658 | } 659 | 660 | void virtq_push(struct virtio_virtq *vq, struct virtio_iov *iov, uint32_t len) 661 | { 662 | /* Put buffer head index and len into used ring */ 663 | struct virtq_iov_private *priv = containerof(iov, struct virtq_iov_private, 664 | iov); 665 | uint16_t used_idx = vq->used->idx % vq->qsz; 666 | struct virtq_used_elem *used = &vq->used->ring[used_idx]; 667 | used->id = priv->used_head; 668 | used->len = len; 669 | 670 | virtq_inflight_used_update(vq, used->id); 671 | 672 | smp_wmb(); /* barrier pair [A] */ 673 | vq->used->idx++; 674 | 675 | virtq_inflight_used_commit(vq, used->id); 676 | VHD_OBJ_DEBUG(vq, "head = %d", priv->used_head); 677 | 678 | /* use the memmap the request was started with rather than the current one */ 679 | if (vq->log) { 680 | vhd_log_modified(vq, priv->mm, &priv->iov, used_idx); 681 | } 682 | 683 | virtq_notify(vq); 684 | vq->stat.metrics.request_completed++; 685 | } 686 | 687 | void virtq_set_notify_fd(struct virtio_virtq *vq, int fd) 688 | { 689 | vq->notify_fd = fd; 690 | 691 | /* 692 | * Always notify the new fd: on initial setup QEMU sets up kick_fd 693 | * before call_fd, so before call_fd becomes configured there may 694 | * already be processed descriptors that the guest wasn't notified about. 695 | * And on reconnect, the connection may have been lost before the server 696 | * has had a chance to signal the guest. 697 | */ 698 | virtq_do_notify(vq); 699 | } 700 | 701 | void virtio_virtq_get_stat(struct virtio_virtq *vq, 702 | struct vhd_vq_metrics *metrics) 703 | { 704 | *metrics = vq->stat.metrics; 705 | } 706 | 707 | __attribute__((weak)) 708 | void abort_request(struct virtio_virtq *vq, struct virtio_iov *iov) 709 | { 710 | /* 711 | * FIXME: this is called when the message framing is messed up. This 712 | * appears severe enough to just stop processing the virtq and mark it 713 | * broken. 714 | */ 715 | VHD_LOG_ERROR("no valid virtio request found, queue %p should be aborted", vq); 716 | virtq_push(vq, iov, 0); 717 | virtio_free_iov(iov); 718 | } 719 | --------------------------------------------------------------------------------
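For orientation, here is a minimal device-side consumer sketch built on the virt_queue.c API above. It is illustrative only: the callback signature is assumed to match the call site in virtq_dequeue_one(), and the handler names and the zero-fill "work" are hypothetical, not part of this library.

#include <stdint.h>
#include <string.h>

#include "virt_queue.h"

/* hypothetical request handler: zero-fill every device-writable buffer */
static void my_handle_request(void *arg, struct virtio_virtq *vq,
                              struct virtio_iov *iov)
{
    uint32_t len = 0;
    uint16_t i;

    (void)arg; /* would carry the device pointer, unused in this sketch */

    for (i = 0; i < iov->niov_in; i++) {
        memset(iov->iov_in[i].base, 0, iov->iov_in[i].len);
        len += iov->iov_in[i].len;
    }

    /* publish the head and the number of bytes written to the used ring */
    virtq_push(vq, iov, len);
    /* drop the memmap reference taken at dequeue time */
    virtio_free_iov(iov);
}

/* hypothetical kick handler: drain the available ring */
static void my_on_kick(struct virtio_virtq *vq, void *dev)
{
    if (virtq_dequeue_many(vq, my_handle_request, dev) < 0) {
        /* the queue has been marked broken; stop processing it */
    }
}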