├── README.md ├── buse.png ├── kernel ├── Makefile ├── buse-blkdev.c ├── buse-blkdev.h ├── buse-chrdev.c ├── buse-chrdev.h ├── buse-configfs.c ├── buse-configfs.h ├── buse-rqueue.c ├── buse-rqueue.h ├── buse-wqueue.c ├── buse-wqueue.h ├── main.c └── main.h └── lib └── go └── buse ├── buse.go └── go.mod /README.md: -------------------------------------------------------------------------------- 1 | # BUSE: Block Device in Userspace 2 | 3 | ## Write performance comparison 4 | 5 |
![Write performance comparison](buse.png)
6 | 7 | ## Requirements 8 | 9 | * GNU Make 10 | * Linux Kernel 5.11 or newer 11 | * Linux Kernel Headers 12 | 13 | ## Installation 14 | 15 | ``` 16 | cd kernel 17 | make 18 | sudo make install 19 | sudo modprobe buse 20 | ``` 21 | -------------------------------------------------------------------------------- /buse.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asch/buse/f12ccb1d15a93539b0df1119b3ce2055ba51f50b/buse.png -------------------------------------------------------------------------------- /kernel/Makefile: -------------------------------------------------------------------------------- 1 | obj-m += buse.o 2 | buse-objs := \ 3 | main.o \ 4 | buse-blkdev.o \ 5 | buse-chrdev.o \ 6 | buse-wqueue.o \ 7 | buse-rqueue.o \ 8 | buse-configfs.o 9 | 10 | MODULEDIR := /lib/modules/$(shell uname -r) 11 | KERNELDIR := $(MODULEDIR)/build 12 | 13 | SOURCES := $(wildcard *.c) 14 | HEADERS := $(wildcard *.h) 15 | 16 | #CC += -DDEBUG 17 | 18 | build: buse.ko 19 | 20 | buse.ko: $(SOURCES) $(HEADERS) 21 | make -C $(KERNELDIR) M=$(shell pwd) modules 22 | 23 | install: buse.ko 24 | install -D -m 644 $(shell pwd)/$< $(MODULEDIR)/extra/$< 25 | strip --strip-debug $(MODULEDIR)/extra/$< 26 | depmod 27 | 28 | clean: 29 | make -C $(KERNELDIR) M=$(shell pwd) clean 30 | -------------------------------------------------------------------------------- /kernel/buse-blkdev.c: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2021-2022 Vojtech Aschenbrenner */ 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "buse-blkdev.h" 10 | #include "buse-rqueue.h" 11 | #include "buse-wqueue.h" 12 | #include "main.h" 13 | 14 | /* 15 | * Init function called for every queue of created device. We just fill user data and compute the 16 | * queue id. 17 | */ 18 | static int buse_init_hctx(struct blk_mq_hw_ctx *hw_ctx, void *driver_data, unsigned int hw_ctx_id) 19 | { 20 | struct buse *buse = hw_ctx->queue->queuedata; 21 | struct buse_queue *q = buse->queues; 22 | 23 | q[hw_ctx_id].id = hw_ctx_id; 24 | hw_ctx->driver_data = &q[hw_ctx_id]; 25 | buse->num_queues++; 26 | 27 | return 0; 28 | } 29 | 30 | /* 31 | * io request callback called for every io in the queue. This function is called by blk-mq. 32 | */ 33 | static blk_status_t buse_queue_rq(struct blk_mq_hw_ctx *hw_ctx, const struct blk_mq_queue_data *data) 34 | { 35 | struct request *r = data->rq; 36 | struct buse_cmd *cmd = blk_mq_rq_to_pdu(r); 37 | struct buse *buse; 38 | 39 | cmd->rq = r; 40 | cmd->queue = hw_ctx->driver_data; 41 | cmd->canceled = false; 42 | cmd->magic = BUSE_MAGIC; 43 | 44 | buse = cmd->queue->r.buse; 45 | if (atomic_read(&buse->stopped) == 1) 46 | return BLK_STS_IOERR; 47 | 48 | switch (req_op(r)) { 49 | case REQ_OP_DISCARD: 50 | case REQ_OP_WRITE_SAME: 51 | case REQ_OP_WRITE_ZEROES: 52 | case REQ_OP_SECURE_ERASE: 53 | case REQ_OP_WRITE: 54 | return buse_write(cmd); 55 | case REQ_OP_FLUSH: 56 | return buse_flush(cmd); 57 | case REQ_OP_READ: 58 | return buse_read(cmd); 59 | } 60 | 61 | pr_warn("Unsupported request no. %d\n", req_op(r)); 62 | 63 | return BLK_STS_IOERR; 64 | } 65 | 66 | static const struct block_device_operations buse_blkdev_ops = { 67 | .owner = THIS_MODULE, 68 | }; 69 | 70 | /* 71 | * When io request times out we just print warning to the dmesg a give it another chance. This is 72 | * the best we can do. If the device is eventually stopped, these requests will be canceled. 
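 *
 * Returning BLK_EH_RESET_TIMER below only re-arms the blk-mq timer; it never
 * fails the request. An I/O issued while no userspace daemon is attached
 * therefore keeps waiting, and such requests are only completed with an error
 * once the device is stopped and its queues are drained.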
73 | */ 74 | static enum blk_eh_timer_return buse_timeout(struct request *rq, bool b) 75 | { 76 | pr_warn("Request timed out! Is userspace connected? (rq = %p)\n", rq); 77 | 78 | return BLK_EH_RESET_TIMER; 79 | } 80 | 81 | /* 82 | * Control structure for blk-mq operations. 83 | */ 84 | static const struct blk_mq_ops buse_mq_ops = { 85 | .init_hctx = buse_init_hctx, 86 | .queue_rq = buse_queue_rq, 87 | .timeout = buse_timeout, 88 | }; 89 | 90 | /* 91 | * blk-mq tags initialization. 92 | */ 93 | static void buse_set_tag_set(struct buse *buse) 94 | { 95 | struct blk_mq_tag_set *tag_set = &buse->blkdev.tag_set; 96 | 97 | tag_set->cmd_size = sizeof(struct buse_cmd); 98 | tag_set->driver_data = buse; 99 | tag_set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING; 100 | if (buse->no_scheduler) 101 | tag_set->flags |= BLK_MQ_F_NO_SCHED; 102 | tag_set->nr_hw_queues = buse->hw_queues; 103 | tag_set->numa_node = NUMA_NO_NODE; 104 | tag_set->ops = &buse_mq_ops; 105 | tag_set->queue_depth = buse->queue_depth; 106 | } 107 | 108 | /* 109 | * Block device initialization. All configuration parameters are set according to the configured 110 | * values in struct buse. This is only related to the block device side of the module. 111 | */ 112 | int buse_blkdev_init(struct buse *buse) 113 | { 114 | int ret; 115 | struct buse_blkdev *blkdev = &buse->blkdev; 116 | struct blk_mq_tag_set *tag_set = &blkdev->tag_set; 117 | size_t max_writes = buse->write_chunk_size / buse->block_size; 118 | size_t writelist_size = max_writes * sizeof(struct writelist_item); 119 | unsigned int max_hw_sectors; 120 | 121 | blkdev->disk = alloc_disk_node(1, NUMA_NO_NODE); 122 | if (!blkdev->disk) { 123 | ret = -ENOMEM; 124 | goto err; 125 | } 126 | 127 | buse_set_tag_set(buse); 128 | 129 | ret = blk_mq_alloc_tag_set(tag_set); 130 | if (ret) 131 | goto err_disk; 132 | 133 | blkdev->request_queue = blk_mq_init_queue_data(tag_set, buse); 134 | if (IS_ERR(blkdev->request_queue)) { 135 | ret = PTR_ERR(blkdev->request_queue); 136 | goto err_tag; 137 | } 138 | 139 | blk_queue_write_cache(blkdev->request_queue, true, false); 140 | 141 | max_hw_sectors = (buse->write_chunk_size - writelist_size) / SECTOR_SIZE; 142 | if (max_hw_sectors > buse->read_shm_size / SECTOR_SIZE) 143 | max_hw_sectors = buse->read_shm_size / SECTOR_SIZE; 144 | blk_queue_max_hw_sectors(blkdev->request_queue, max_hw_sectors); 145 | 146 | blk_queue_flag_set(QUEUE_FLAG_NONROT, blkdev->request_queue); 147 | blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, blkdev->request_queue); 148 | blk_queue_logical_block_size(blkdev->request_queue, buse->block_size); 149 | blk_queue_physical_block_size(blkdev->request_queue, buse->block_size); 150 | 151 | if (buse->io_min < buse->block_size || buse->io_min % buse->block_size != 0) 152 | buse->io_min = buse->block_size; 153 | blk_queue_io_min(blkdev->request_queue, buse->io_min); 154 | 155 | if (buse->io_opt < buse->block_size || buse->io_opt % buse->block_size != 0) 156 | buse->io_opt = buse->block_size; 157 | blk_queue_io_opt(blkdev->request_queue, buse->io_opt); 158 | 159 | blk_queue_max_segments(blkdev->request_queue, USHRT_MAX); 160 | blk_queue_max_segment_size(blkdev->request_queue, UINT_MAX); 161 | 162 | if (buse->can_write_same) 163 | blk_queue_max_write_same_sectors(blkdev->request_queue, UINT_MAX); 164 | 165 | if (buse->can_write_zeroes) 166 | blk_queue_max_write_zeroes_sectors(blkdev->request_queue, UINT_MAX); 167 | 168 | if (buse->can_discard) { 169 | blk_queue_flag_set(QUEUE_FLAG_DISCARD, blkdev->request_queue); 170 | 
blkdev->request_queue->limits.discard_granularity = buse->block_size; 171 | blkdev->request_queue->limits.discard_alignment = buse->block_size; 172 | 173 | blk_queue_max_discard_sectors(blkdev->request_queue, UINT_MAX); 174 | blk_queue_max_discard_segments(blkdev->request_queue, USHRT_MAX); 175 | } 176 | 177 | if (buse->can_secure_erase) 178 | blk_queue_flag_set(QUEUE_FLAG_SECERASE, blkdev->request_queue); 179 | 180 | return 0; 181 | 182 | err_tag: 183 | blk_mq_free_tag_set(tag_set); 184 | err_disk: 185 | put_disk(blkdev->disk); 186 | err: 187 | return ret; 188 | } 189 | 190 | /* 191 | * Remove the block device if it was created, otherwise just cleanup tagset. 192 | */ 193 | void buse_blkdev_exit(struct buse *buse) 194 | { 195 | if (buse->blkdev.created) { 196 | del_gendisk(buse->blkdev.disk); 197 | put_disk(buse->blkdev.disk); 198 | } 199 | 200 | blk_cleanup_queue(buse->blkdev.request_queue); 201 | blk_mq_free_tag_set(&buse->blkdev.tag_set); 202 | buse->blkdev.created = false; 203 | } 204 | 205 | /* 206 | * Registers the block device so that it is visible to the system. 207 | */ 208 | void buse_gendisk_register(struct buse *buse) 209 | { 210 | struct gendisk *disk = buse->blkdev.disk; 211 | 212 | disk->major = buse_blkdev_major; 213 | disk->minors = buse_blkdev_max_minors; 214 | disk->first_minor = buse->index * disk->minors; 215 | disk->flags |= GENHD_FL_EXT_DEVT; 216 | disk->fops = &buse_blkdev_ops; 217 | disk->private_data = buse; 218 | disk->queue = buse->blkdev.request_queue; 219 | snprintf(disk->disk_name, DISK_NAME_LEN, "%s%llu", buse_blkdev_name, buse->index); 220 | 221 | /* Capacity needs to be set to 0, otherwise add_disk() hangs! Correct 222 | * capacity is set afterwards. */ 223 | set_capacity(buse->blkdev.disk, 0); 224 | add_disk(disk); 225 | set_capacity(buse->blkdev.disk, buse->size >> SECTOR_SHIFT); 226 | } 227 | 228 | /* 229 | * Returns numa node for given queue id. 230 | */ 231 | int buse_get_numa_node_for_queue_id(struct buse *buse, int queue_id) 232 | { 233 | int i; 234 | struct blk_mq_queue_map *qmap = &buse->blkdev.tag_set.map[HCTX_TYPE_DEFAULT]; 235 | 236 | for_each_possible_cpu(i) { 237 | if (queue_id == qmap->mq_map[i]) 238 | return cpu_to_node(i); 239 | } 240 | 241 | return NUMA_NO_NODE; 242 | } 243 | -------------------------------------------------------------------------------- /kernel/buse-blkdev.h: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2021 Vojtech Aschenbrenner */ 2 | 3 | #ifndef BUSE_BLKDEV_H 4 | #define BUSE_BLKDEV_H 5 | 6 | #include "main.h" 7 | 8 | /* 9 | * Block device initialization. All configuration parameters are set according to the configured 10 | * values in struct buse. This is only related to the block device side of the module. 11 | */ 12 | int buse_blkdev_init(struct buse *buse); 13 | 14 | 15 | /* 16 | * Remove the block device if it was created, otherwiese just cleanup tagset. 17 | */ 18 | void buse_blkdev_exit(struct buse *buse); 19 | 20 | 21 | /* 22 | * Registers the block device so that it is visible to the system. 23 | */ 24 | void buse_gendisk_register(struct buse *buse); 25 | 26 | /* 27 | * Returns numa node for given queue id. 
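 *
 * The mapping is taken from the tag set's default HCTX map, so per-queue
 * resources (for example the shared memory allocated in rqueue_init()) can be
 * placed on the NUMA node that services the queue's CPUs.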
28 | */ 29 | int buse_get_numa_node_for_queue_id(struct buse *buse, int queue_id); 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /kernel/buse-chrdev.c: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2021 Vojtech Aschenbrenner */ 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include "buse-blkdev.h" 12 | #include "buse-chrdev.h" 13 | #include "buse-rqueue.h" 14 | #include "buse-wqueue.h" 15 | #include "main.h" 16 | 17 | /* 18 | * Callback for mmap(). It is reserved for future usage. 19 | */ 20 | static void vm_open(struct vm_area_struct *vma) 21 | { 22 | } 23 | 24 | /* 25 | * Callback for munap(). It is reserved for future usage. 26 | */ 27 | static void vm_close(struct vm_area_struct *vma) 28 | { 29 | } 30 | 31 | /* 32 | * VM fault callback for write queue. First pass through the shared memory generates faults and 33 | * fills the address mapping. 34 | */ 35 | static vm_fault_t vm_fault_wqueue(struct vm_fault *vmf) 36 | { 37 | struct buse_wqueue *wq = vmf->vma->vm_private_data; 38 | pgoff_t offset = vmf->pgoff << PAGE_SHIFT; 39 | struct page *page; 40 | 41 | if (offset >= wq->buse->write_shm_size) 42 | return -EFAULT; 43 | 44 | page = vmalloc_to_page(wq->shmem + offset); 45 | 46 | get_page(page); 47 | vmf->page = page; 48 | 49 | return 0; 50 | } 51 | 52 | /* 53 | * VM fault callback for read queue. First pass through the shared memory generates faults and fills 54 | * the address mapping. 55 | */ 56 | static vm_fault_t vm_fault_rqueue(struct vm_fault *vmf) 57 | { 58 | struct buse_rqueue *rq = vmf->vma->vm_private_data; 59 | pgoff_t offset = vmf->pgoff << PAGE_SHIFT; 60 | struct page *page; 61 | 62 | if (offset >= rq->buse->write_shm_size) 63 | return -EFAULT; 64 | 65 | page = vmalloc_to_page(rq->shmem + offset); 66 | 67 | get_page(page); 68 | vmf->page = page; 69 | 70 | return 0; 71 | } 72 | 73 | struct buse_wqueue *inode_get_wqueue(struct inode *inode) 74 | { 75 | return container_of(inode->i_cdev, struct buse_wqueue, chrdev.cdev); 76 | } 77 | 78 | struct buse_rqueue *inode_get_rqueue(struct inode *inode) 79 | { 80 | return container_of(inode->i_cdev, struct buse_rqueue, chrdev.cdev); 81 | } 82 | 83 | /* 84 | * File close() callback for write queue. We immediately set that the queue is unbound. 85 | */ 86 | static int chrdev_release_wqueue(struct inode *inode, struct file *file) 87 | { 88 | struct buse_wqueue *wq = inode_get_wqueue(inode); 89 | if (!wq || atomic_read(&wq->bound) == 0) 90 | return -EFAULT; 91 | 92 | atomic_set(&wq->bound, 0); 93 | 94 | return 0; 95 | } 96 | 97 | /* 98 | * File close() callback for read queue. We immediately set that the queue is unbound. 99 | */ 100 | static int chrdev_release_rqueue(struct inode *inode, struct file *file) 101 | { 102 | struct buse_rqueue *rq = inode_get_rqueue(inode); 103 | if (!rq || atomic_read(&rq->bound) == 0) 104 | return -EFAULT; 105 | 106 | atomic_set(&rq->bound, 0); 107 | 108 | return 0; 109 | } 110 | 111 | /* 112 | * File open() callback for write queue. We immediately set that the queue is bound. 
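 *
 * A second open() while the queue is already bound fails (the bound flag is
 * checked below), so exactly one daemon drives each queue. The expected
 * userspace sequence is roughly the following sketch, where the device name
 * assumes the default "buse" prefix, queue 0 and a udev-created /dev node, and
 * write_shm_size stands for the value configured via configfs:
 *
 *	int fd = open("/dev/buse0-w0", O_RDWR);
 *	void *shm = mmap(NULL, write_shm_size, PROT_READ | PROT_WRITE,
 *			 MAP_SHARED, fd, 0);
 *
 * after which the daemon read()s chunk descriptors and write()s back
 * acknowledgements through the callbacks defined later in this file.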
113 | */ 114 | static int chrdev_open_wqueue(struct inode *inode, struct file *file) 115 | { 116 | struct buse_wqueue *wq = inode_get_wqueue(inode); 117 | if (!wq || atomic_read(&wq->bound) == 1) 118 | return -EFAULT; 119 | 120 | file->private_data = wq; 121 | buse_wqueue_bind(wq); 122 | 123 | return 0; 124 | } 125 | 126 | /* 127 | * File open() callback for read queue. We immediately set that the queue is bound. 128 | */ 129 | static int chrdev_open_rqueue(struct inode *inode, struct file *file) 130 | { 131 | struct buse_rqueue *rq = inode_get_rqueue(inode); 132 | if (!rq || atomic_read(&rq->bound) == 1) 133 | return -EFAULT; 134 | 135 | file->private_data = rq; 136 | buse_rqueue_bind(rq); 137 | 138 | return 0; 139 | } 140 | 141 | static struct vm_operations_struct vm_ops_wqueue = { 142 | .close = vm_close, 143 | .fault = vm_fault_wqueue, 144 | .open = vm_open, 145 | }; 146 | 147 | static struct vm_operations_struct vm_ops_rqueue = { 148 | .close = vm_close, 149 | .fault = vm_fault_rqueue, 150 | .open = vm_open, 151 | }; 152 | 153 | /* 154 | * File mmap() callback for write queue. 155 | */ 156 | static int chrdev_mmap_wqueue(struct file *file, struct vm_area_struct *vma) 157 | { 158 | vma->vm_ops = &vm_ops_wqueue; 159 | vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; 160 | vma->vm_private_data = file->private_data; 161 | vm_open(vma); 162 | 163 | return 0; 164 | } 165 | 166 | /* 167 | * File mmap() callback for read queue. 168 | */ 169 | static int chrdev_mmap_rqueue(struct file *file, struct vm_area_struct *vma) 170 | { 171 | vma->vm_ops = &vm_ops_rqueue; 172 | vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; 173 | vma->vm_private_data = file->private_data; 174 | vm_open(vma); 175 | 176 | return 0; 177 | } 178 | 179 | /* 180 | * File write() callback for read queue. Writing to the read queue character device the userspace 181 | * acknowledge that the read request is done. The written value is offset to the shared memory. 182 | */ 183 | ssize_t chrdev_write_rqueue(struct file *file, const char __user *buf, size_t len, loff_t *off) 184 | { 185 | u64 data_offset; 186 | unsigned long ret; 187 | struct buse_rqueue *rq = inode_get_rqueue(file->f_inode); 188 | 189 | if (len != 8) { 190 | BUG(); 191 | return 0; 192 | } 193 | 194 | if (*off != 0) { 195 | BUG(); 196 | return 0; 197 | } 198 | 199 | ret = copy_from_user(&data_offset, buf, len); 200 | if (ret) { 201 | pr_alert("Cannot copy\n"); 202 | return -ENOMEM; 203 | } 204 | 205 | ack_read_request(rq, data_offset, false); 206 | 207 | *off = 0; 208 | 209 | return len; 210 | } 211 | 212 | /* 213 | * File write() callback for write queue. Writing to the write queue character device the userspace 214 | * acknowledge that the write chunk was processed. The written value is offset to the shared memory. 215 | */ 216 | ssize_t chrdev_write_wqueue(struct file *file, const char __user *buf, size_t len, loff_t *off) 217 | { 218 | u64 chunk_offset; 219 | struct buse_wqueue *wq = inode_get_wqueue(file->f_inode); 220 | unsigned long ret = copy_from_user(&chunk_offset, buf, len); 221 | 222 | if (len != 8) { 223 | BUG(); 224 | return 0; 225 | } 226 | 227 | if (*off != 0) { 228 | BUG(); 229 | return 0; 230 | } 231 | 232 | if (ret) { 233 | pr_alert("Cannot copy\n"); 234 | return -ENOMEM; 235 | } 236 | 237 | ack_write_request(wq, chunk_offset, false); 238 | 239 | return len; 240 | } 241 | 242 | /* 243 | * File read() callback for write queue. Userspace reads metadata about the write chunk coming to 244 | * the block device. 
It is number of batched writes in the chunk and offset to the shared memory 245 | * where the chunks is located. 246 | */ 247 | ssize_t chrdev_read_wqueue(struct file *file, char __user *buf, size_t len, loff_t *off) 248 | { 249 | struct write_chunk *chunk; 250 | int ret; 251 | struct buse_wqueue *wq = inode_get_wqueue(file->f_inode); 252 | 253 | if (len != 16) { 254 | BUG(); 255 | return 0; 256 | } 257 | 258 | if (*off != 0) { 259 | BUG(); 260 | return 0; 261 | } 262 | 263 | chunk = pop_write_request_wait(wq); 264 | if (IS_ERR(chunk)) { 265 | return PTR_ERR(chunk); 266 | } 267 | 268 | ret = copy_to_user(buf, &chunk->shmem_offset, sizeof(chunk->shmem_offset)); 269 | buf += sizeof(chunk->shmem_offset); 270 | if (ret) { 271 | pr_alert("copy_to_user failed\n"); 272 | return -EFAULT; 273 | } 274 | 275 | ret = copy_to_user(buf, &chunk->num_writes, sizeof(chunk->num_writes)); 276 | if (ret) { 277 | pr_alert("copy_to_user failed\n"); 278 | return -EFAULT; 279 | } 280 | 281 | if (is_wqueue_term(chunk)) 282 | kfree(chunk); 283 | 284 | return len; 285 | } 286 | 287 | /* 288 | * File read() callback for read queue. Userspace reads metadata about the read request coming to 289 | * the block device and offset to the shared memory where data should be read into. 290 | */ 291 | ssize_t chrdev_read_rqueue(struct file *file, char __user *buf, size_t len, loff_t *off) 292 | { 293 | struct buse_rqueue *rq = inode_get_rqueue(file->f_inode); 294 | struct read_chunk *chunk; 295 | int ret; 296 | 297 | if (len != 24) { 298 | BUG(); 299 | return 0; 300 | } 301 | 302 | if (*off != 0) { 303 | BUG(); 304 | return 0; 305 | } 306 | 307 | chunk = pop_read_request_wait(rq); 308 | if (IS_ERR(chunk)) { 309 | return PTR_ERR(chunk); 310 | } 311 | 312 | ret = copy_to_user(buf, &chunk->sector, sizeof(chunk->sector)); 313 | buf += sizeof(chunk->sector); 314 | if (ret) { 315 | pr_alert("copy_to_user failed\n"); 316 | return -EFAULT; 317 | } 318 | 319 | ret = copy_to_user(buf, &chunk->len, sizeof(chunk->len)); 320 | buf += sizeof(chunk->len); 321 | if (ret) { 322 | pr_alert("copy_to_user failed\n"); 323 | return -EFAULT; 324 | } 325 | 326 | ret = copy_to_user(buf, &chunk->shmem_offset, sizeof(chunk->shmem_offset)); 327 | buf += sizeof(chunk->shmem_offset); 328 | if (ret) { 329 | pr_alert("copy_to_user failed\n"); 330 | return -EFAULT; 331 | } 332 | 333 | if (is_rqueue_term(chunk)) 334 | kfree(chunk); 335 | 336 | return len; 337 | } 338 | 339 | const struct file_operations chrdev_fops_wqueue = { 340 | .mmap = chrdev_mmap_wqueue, 341 | .open = chrdev_open_wqueue, 342 | .owner = THIS_MODULE, 343 | .read = chrdev_read_wqueue, 344 | .write = chrdev_write_wqueue, 345 | .release = chrdev_release_wqueue, 346 | }; 347 | 348 | const struct file_operations chrdev_fops_rqueue = { 349 | .mmap = chrdev_mmap_rqueue, 350 | .open = chrdev_open_rqueue, 351 | .owner = THIS_MODULE, 352 | .read = chrdev_read_rqueue, 353 | .write = chrdev_write_rqueue, 354 | .release = chrdev_release_rqueue, 355 | }; 356 | 357 | /* 358 | * Init one character device corresponding to one of the queues. 
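 *
 * For reference, the fixed-size records served by the read() callbacks above
 * look as follows from a 64-bit userspace; the field names are illustrative
 * and the layout simply mirrors the copy_to_user() sequences in
 * chrdev_read_wqueue() and chrdev_read_rqueue():
 *
 *	struct wqueue_desc { uint64_t shmem_offset; uint64_t num_writes; };            16 bytes
 *	struct rqueue_desc { uint64_t sector; uint64_t len; uint64_t shmem_offset; };   24 bytes
 *
 * The matching acknowledgement is a single 8-byte value written back to the
 * same file descriptor.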
359 | */ 360 | static int chrdev_queue_init(struct buse_chrdev *chrdev, dev_t minor, char *name, 361 | int i, const struct file_operations *fops) 362 | { 363 | int ret; 364 | 365 | chrdev->region = minor; 366 | cdev_init(&chrdev->cdev, fops); 367 | ret = cdev_add(&chrdev->cdev, minor, 1); 368 | if (ret < 0) 369 | goto err; 370 | 371 | chrdev->dev = device_create(buse_chrdev_class, NULL, minor, NULL,"%s%d", name, i); 372 | if (IS_ERR(chrdev->dev)) { 373 | ret = PTR_ERR(chrdev->dev); 374 | goto err_cdev; 375 | } 376 | 377 | return 0; 378 | 379 | err_cdev: 380 | cdev_del(&chrdev->cdev); 381 | err: 382 | return ret; 383 | } 384 | 385 | static void chrdev_queue_exit(struct buse_chrdev *chrdev) 386 | { 387 | device_destroy(buse_chrdev_class, chrdev->region); 388 | cdev_del(&chrdev->cdev); 389 | } 390 | 391 | /* 392 | * Deallocated read queues related character devices. 393 | */ 394 | static void chrdev_rqueues_exit(struct buse *buse) 395 | { 396 | int i; 397 | struct buse_queue *q = buse->queues; 398 | struct buse_rqueue *rq; 399 | dev_t minor; 400 | 401 | for (i = 0, q = buse->queues; i < buse->num_queues; i++, q++) { 402 | rq = &q->r; 403 | minor = rq->chrdev.region; 404 | chrdev_queue_exit(&rq->chrdev); 405 | } 406 | 407 | minor -= buse->num_queues - 1; 408 | unregister_chrdev_region(minor, buse->num_queues); 409 | } 410 | 411 | /* 412 | * Deallocated write queues related character devices. 413 | */ 414 | static void chrdev_wqueues_exit(struct buse *buse) 415 | { 416 | int i; 417 | struct buse_queue *q = buse->queues; 418 | struct buse_wqueue *wq; 419 | dev_t minor; 420 | 421 | for (i = 0, q = buse->queues; i < buse->num_queues; i++, q++) { 422 | wq = &q->w; 423 | minor = wq->chrdev.region; 424 | chrdev_queue_exit(&wq->chrdev); 425 | } 426 | 427 | minor -= buse->num_queues - 1; 428 | unregister_chrdev_region(minor, buse->num_queues); 429 | } 430 | 431 | /* 432 | * Allocate read queues related character devices. 433 | */ 434 | static int chrdev_rqueues_init(struct buse *buse) 435 | { 436 | int ret, i; 437 | struct buse_queue *q; 438 | struct buse_rqueue *rq; 439 | dev_t minor; 440 | char name[DISK_NAME_LEN]; 441 | snprintf(name, DISK_NAME_LEN, "%s%llu-r", buse_blkdev_name, buse->index); 442 | 443 | ret = alloc_chrdev_region(&minor, 0, buse->num_queues, name); 444 | if (ret < 0) 445 | goto err; 446 | 447 | for (i = 0, q = buse->queues; i < buse->num_queues; i++, q++, minor++) { 448 | rq = &q->r; 449 | ret = chrdev_queue_init(&rq->chrdev, minor, name, i, &chrdev_fops_rqueue); 450 | if (ret) 451 | goto err_alloc; 452 | } 453 | 454 | return 0; 455 | 456 | err_alloc: 457 | for (; i > 0; i--, q--, minor--) { 458 | rq = &q->r; 459 | chrdev_queue_exit(&rq->chrdev); 460 | } 461 | 462 | unregister_chrdev_region(minor, buse->num_queues); 463 | err: 464 | return ret; 465 | } 466 | 467 | /* 468 | * Allocate write queues related character devices. 
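 *
 * One character device (and one minor number) is created per hardware queue
 * from a single alloc_chrdev_region() range, mirroring chrdev_rqueues_init()
 * above.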
469 | */ 470 | static int chrdev_wqueues_init(struct buse *buse) 471 | { 472 | int ret, i; 473 | struct buse_queue *q; 474 | struct buse_wqueue *wq; 475 | dev_t minor; 476 | char name[DISK_NAME_LEN]; 477 | snprintf(name, DISK_NAME_LEN, "%s%llu-w", buse_blkdev_name, buse->index); 478 | 479 | ret = alloc_chrdev_region(&minor, 0, buse->num_queues, name); 480 | if (ret < 0) 481 | goto err; 482 | 483 | for (i = 0, q = buse->queues; i < buse->num_queues; i++, q++, minor++) { 484 | wq = &q->w; 485 | ret = chrdev_queue_init(&wq->chrdev, minor, name, i, &chrdev_fops_wqueue); 486 | if (ret) 487 | goto err_alloc; 488 | } 489 | 490 | return 0; 491 | 492 | err_alloc: 493 | for (; i > 0; i--, q--, minor--) { 494 | wq = &q->w; 495 | chrdev_queue_exit(&wq->chrdev); 496 | } 497 | 498 | unregister_chrdev_region(minor, buse->num_queues); 499 | err: 500 | return ret; 501 | } 502 | 503 | /* 504 | * Init all needed character devices for queues to the userspace. 505 | */ 506 | int buse_chrdev_init(struct buse *buse) 507 | { 508 | int ret; 509 | 510 | ret = chrdev_wqueues_init(buse); 511 | if (ret) 512 | goto err; 513 | 514 | ret = chrdev_rqueues_init(buse); 515 | if (ret) 516 | goto err_wqueues; 517 | 518 | return 0; 519 | 520 | err_wqueues: 521 | chrdev_wqueues_exit(buse); 522 | err: 523 | return ret; 524 | } 525 | 526 | void buse_chrdev_exit(struct buse *buse) 527 | { 528 | chrdev_rqueues_exit(buse); 529 | chrdev_wqueues_exit(buse); 530 | } 531 | -------------------------------------------------------------------------------- /kernel/buse-chrdev.h: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2021 Vojtech Aschenbrenner */ 2 | 3 | #ifndef BUSE_CHRDEV_H 4 | #define BUSE_CHRDEV_H 5 | 6 | #include "main.h" 7 | 8 | /* 9 | * Init all needed character devices for queues to the userspace. 10 | */ 11 | int buse_chrdev_init(struct buse *buse); 12 | 13 | void buse_chrdev_exit(struct buse *buse); 14 | 15 | #endif 16 | -------------------------------------------------------------------------------- /kernel/buse-configfs.c: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2021 Vojtech Aschenbrenner */ 2 | 3 | /* 4 | * This module contains all configfs related configuration. Every configfs attribute needs to define: 5 | * 6 | * 1) buse_`attr`_show() function returning the current value. 7 | * 8 | * 2) buse_`attr`_store() setting the value and eventually doing function calls. 9 | * 10 | * 3) define macro CONFIGFS_ATTR(buse_, `attr`); 11 | * 12 | * 4) put &buse_attr_`attr` record to the buse_attrs[] 13 | * 14 | * This process can be a bit repetitive for some attributes, but we keep it like that for better 15 | * control over allowed inserted values and not obfuscating the code with unclean macros. 16 | */ 17 | 18 | #include 19 | 20 | #include "buse-configfs.h" 21 | #include "buse-rqueue.h" 22 | #include "buse-wqueue.h" 23 | #include "main.h" 24 | 25 | static inline struct buse *to_buse(struct config_item *item) 26 | { 27 | return item ? 
container_of(item, struct buse, item) : NULL; 28 | } 29 | 30 | static ssize_t buse_power_show(struct config_item *item, char *page) 31 | { 32 | struct buse *buse = to_buse(item); 33 | 34 | return snprintf(page, PAGE_SIZE, "%d\n", buse->power); 35 | } 36 | 37 | static ssize_t buse_power_store(struct config_item *item, const char *page, size_t count) 38 | { 39 | struct buse *buse = to_buse(item); 40 | int ret; 41 | bool power; 42 | int i; 43 | struct buse_wqueue *wq; 44 | struct buse_rqueue *rq; 45 | 46 | ret = kstrtobool(page, &power); 47 | if (ret) 48 | goto err; 49 | 50 | mutex_lock(&buse->configfs_mutex); 51 | 52 | if (power == buse->power) { 53 | ret = -EINVAL; 54 | goto err_mutex; 55 | } 56 | 57 | atomic_set(&buse->stopped, !power); 58 | 59 | if (!power) 60 | buse_stop(buse); 61 | 62 | if (power && buse->queues) { 63 | for (i = 0; i < buse->num_queues; i++) { 64 | wq = &buse->queues[i].w; 65 | mutex_lock(&wq->lock); 66 | wq->terminated = false; 67 | mutex_unlock(&wq->lock); 68 | } 69 | 70 | for (i = 0; i < buse->num_queues; i++) { 71 | rq = &buse->queues[i].r; 72 | mutex_lock(&rq->lock); 73 | rq->terminated = false; 74 | mutex_unlock(&rq->lock); 75 | } 76 | } 77 | 78 | if (power && !buse->queues) { 79 | ret = buse_on(buse); 80 | if (ret) 81 | goto err_mutex; 82 | } 83 | 84 | buse->power = power; 85 | ret = count; 86 | 87 | err_mutex: 88 | mutex_unlock(&buse->configfs_mutex); 89 | err: 90 | return ret; 91 | } 92 | 93 | CONFIGFS_ATTR(buse_, power); 94 | 95 | static ssize_t buse_hw_queues_show(struct config_item *item, char *page) 96 | { 97 | struct buse *buse = to_buse(item); 98 | 99 | return snprintf(page, PAGE_SIZE, "%llu\n", buse->hw_queues); 100 | } 101 | 102 | static ssize_t buse_hw_queues_store(struct config_item *item, const char *page, size_t count) 103 | { 104 | struct buse *buse = to_buse(item); 105 | int ret; 106 | u64 hw_queues; 107 | 108 | if (buse->power) 109 | return -EBUSY; 110 | 111 | ret = kstrtou64(page, 0, &hw_queues); 112 | if (ret) 113 | goto err; 114 | 115 | buse->hw_queues = hw_queues; 116 | 117 | return count; 118 | 119 | err: 120 | return ret; 121 | } 122 | 123 | CONFIGFS_ATTR(buse_, hw_queues); 124 | 125 | static ssize_t buse_queue_depth_show(struct config_item *item, char *page) 126 | { 127 | struct buse *buse = to_buse(item); 128 | 129 | return snprintf(page, PAGE_SIZE, "%llu\n", buse->queue_depth); 130 | } 131 | 132 | static ssize_t buse_queue_depth_store(struct config_item *item, const char *page, size_t count) 133 | { 134 | struct buse *buse = to_buse(item); 135 | int ret; 136 | u64 queue_depth; 137 | 138 | if (buse->power) 139 | return -EBUSY; 140 | 141 | ret = kstrtou64(page, 0, &queue_depth); 142 | if (ret) 143 | goto err; 144 | 145 | buse->queue_depth = queue_depth; 146 | 147 | return count; 148 | 149 | err: 150 | return ret; 151 | } 152 | 153 | CONFIGFS_ATTR(buse_, queue_depth); 154 | 155 | static ssize_t buse_can_write_same_show(struct config_item *item, char *page) 156 | { 157 | struct buse *buse = to_buse(item); 158 | 159 | return snprintf(page, PAGE_SIZE, "%d\n", buse->can_write_same); 160 | } 161 | 162 | static ssize_t buse_can_write_same_store(struct config_item *item, const char *page, size_t count) 163 | { 164 | struct buse *buse = to_buse(item); 165 | int ret; 166 | bool can_write_same; 167 | 168 | if (buse->power) 169 | return -EBUSY; 170 | 171 | ret = kstrtobool(page, &can_write_same); 172 | if (ret) 173 | goto err; 174 | 175 | buse->can_write_same = can_write_same; 176 | 177 | return count; 178 | 179 | err: 180 | return ret; 181 | } 182 | 
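/*
 * The attribute pattern is always the same: a _show() helper, a _store()
 * helper that (except for power) refuses changes while the device is powered
 * on, and a CONFIGFS_ATTR() definition. From userspace the attributes appear
 * as plain files; as a sketch, assuming configfs is mounted at
 * /sys/kernel/config and a device named "0" was created with mkdir, the values
 * below are only illustrative:
 *
 *	mkdir /sys/kernel/config/buse/0
 *	echo 16777216 > /sys/kernel/config/buse/0/size
 *	echo 4096     > /sys/kernel/config/buse/0/blocksize
 *	echo 1        > /sys/kernel/config/buse/0/power
 */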
183 | CONFIGFS_ATTR(buse_, can_write_same); 184 | 185 | static ssize_t buse_can_write_zeroes_show(struct config_item *item, char *page) 186 | { 187 | struct buse *buse = to_buse(item); 188 | 189 | return snprintf(page, PAGE_SIZE, "%d\n", buse->can_write_zeroes); 190 | } 191 | 192 | static ssize_t buse_can_write_zeroes_store(struct config_item *item, const char *page, size_t count) 193 | { 194 | struct buse *buse = to_buse(item); 195 | int ret; 196 | bool can_write_zeroes; 197 | 198 | if (buse->power) 199 | return -EBUSY; 200 | 201 | ret = kstrtobool(page, &can_write_zeroes); 202 | if (ret) 203 | goto err; 204 | 205 | buse->can_write_zeroes = can_write_zeroes; 206 | 207 | return count; 208 | 209 | err: 210 | return ret; 211 | } 212 | 213 | CONFIGFS_ATTR(buse_, can_write_zeroes); 214 | 215 | static ssize_t buse_can_discard_show(struct config_item *item, char *page) 216 | { 217 | struct buse *buse = to_buse(item); 218 | 219 | return snprintf(page, PAGE_SIZE, "%d\n", buse->can_discard); 220 | } 221 | 222 | static ssize_t buse_can_discard_store(struct config_item *item, const char *page, size_t count) 223 | { 224 | struct buse *buse = to_buse(item); 225 | int ret; 226 | bool can_discard; 227 | 228 | if (buse->power) 229 | return -EBUSY; 230 | 231 | ret = kstrtobool(page, &can_discard); 232 | if (ret) 233 | goto err; 234 | 235 | buse->can_discard = can_discard; 236 | 237 | return count; 238 | 239 | err: 240 | return ret; 241 | } 242 | 243 | CONFIGFS_ATTR(buse_, can_discard); 244 | 245 | static ssize_t buse_can_secure_erase_show(struct config_item *item, char *page) 246 | { 247 | struct buse *buse = to_buse(item); 248 | 249 | return snprintf(page, PAGE_SIZE, "%d\n", buse->can_secure_erase); 250 | } 251 | 252 | static ssize_t buse_can_secure_erase_store(struct config_item *item, const char *page, size_t count) 253 | { 254 | struct buse *buse = to_buse(item); 255 | int ret; 256 | bool can_secure_erase; 257 | 258 | if (buse->power) 259 | return -EBUSY; 260 | 261 | ret = kstrtobool(page, &can_secure_erase); 262 | if (ret) 263 | goto err; 264 | 265 | buse->can_secure_erase = can_secure_erase; 266 | 267 | return count; 268 | 269 | err: 270 | return ret; 271 | } 272 | 273 | CONFIGFS_ATTR(buse_, can_secure_erase); 274 | 275 | static ssize_t buse_no_scheduler_show(struct config_item *item, char *page) 276 | { 277 | struct buse *buse = to_buse(item); 278 | 279 | return snprintf(page, PAGE_SIZE, "%d\n", buse->no_scheduler); 280 | } 281 | 282 | static ssize_t buse_no_scheduler_store(struct config_item *item, const char *page, size_t count) 283 | { 284 | struct buse *buse = to_buse(item); 285 | int ret; 286 | bool no_scheduler; 287 | 288 | if (buse->power) 289 | return -EBUSY; 290 | 291 | ret = kstrtobool(page, &no_scheduler); 292 | if (ret) 293 | goto err; 294 | 295 | buse->no_scheduler = no_scheduler; 296 | 297 | return count; 298 | 299 | err: 300 | return ret; 301 | } 302 | 303 | CONFIGFS_ATTR(buse_, no_scheduler); 304 | 305 | static ssize_t buse_read_shm_size_show(struct config_item *item, char *page) 306 | { 307 | struct buse *buse = to_buse(item); 308 | 309 | return snprintf(page, PAGE_SIZE, "%llu\n", buse->read_shm_size); 310 | } 311 | 312 | static ssize_t buse_read_shm_size_store(struct config_item *item, const char *page, size_t count) 313 | { 314 | struct buse *buse = to_buse(item); 315 | int ret; 316 | u64 read_shm_size; 317 | 318 | if (buse->power) 319 | return -EBUSY; 320 | 321 | ret = kstrtou64(page, 0, &read_shm_size); 322 | if (ret) 323 | goto err; 324 | 325 | buse->read_shm_size = 
read_shm_size; 326 | 327 | return count; 328 | 329 | err: 330 | return ret; 331 | } 332 | 333 | CONFIGFS_ATTR(buse_, read_shm_size); 334 | 335 | static ssize_t buse_write_shm_size_show(struct config_item *item, char *page) 336 | { 337 | struct buse *buse = to_buse(item); 338 | 339 | return snprintf(page, PAGE_SIZE, "%llu\n", buse->write_shm_size); 340 | } 341 | 342 | static ssize_t buse_write_shm_size_store(struct config_item *item, const char *page, size_t count) 343 | { 344 | struct buse *buse = to_buse(item); 345 | int ret; 346 | u64 write_shm_size; 347 | 348 | if (buse->power) 349 | return -EBUSY; 350 | 351 | ret = kstrtou64(page, 0, &write_shm_size); 352 | if (ret) 353 | goto err; 354 | 355 | buse->write_shm_size = write_shm_size;; 356 | 357 | return count; 358 | 359 | err: 360 | return ret; 361 | } 362 | 363 | CONFIGFS_ATTR(buse_, write_shm_size); 364 | 365 | static ssize_t buse_write_chunk_size_show(struct config_item *item, char *page) 366 | { 367 | struct buse *buse = to_buse(item); 368 | 369 | return snprintf(page, PAGE_SIZE, "%llu\n", buse->write_chunk_size); 370 | } 371 | 372 | static ssize_t buse_write_chunk_size_store(struct config_item *item, const char *page, size_t count) 373 | { 374 | struct buse *buse = to_buse(item); 375 | int ret; 376 | u64 write_chunk_size; 377 | 378 | if (buse->power) 379 | return -EBUSY; 380 | 381 | ret = kstrtou64(page, 0, &write_chunk_size); 382 | if (ret) 383 | goto err; 384 | 385 | buse->write_chunk_size = write_chunk_size; 386 | 387 | return count; 388 | 389 | err: 390 | return ret; 391 | } 392 | 393 | CONFIGFS_ATTR(buse_, write_chunk_size); 394 | 395 | static ssize_t buse_blocksize_show(struct config_item *item, char *page) 396 | { 397 | struct buse *buse = to_buse(item); 398 | 399 | return snprintf(page, PAGE_SIZE, "%llu\n", buse->block_size); 400 | } 401 | 402 | static ssize_t buse_blocksize_store(struct config_item *item, const char *page, size_t count) 403 | { 404 | struct buse *buse = to_buse(item); 405 | int ret; 406 | u64 blocksize; 407 | 408 | if (buse->power) 409 | return -EBUSY; 410 | 411 | ret = kstrtou64(page, 0, &blocksize); 412 | if (ret) 413 | goto err; 414 | 415 | buse->block_size = blocksize; 416 | 417 | return count; 418 | 419 | err: 420 | return ret; 421 | } 422 | 423 | CONFIGFS_ATTR(buse_, blocksize); 424 | 425 | static ssize_t buse_io_min_show(struct config_item *item, char *page) 426 | { 427 | struct buse *buse = to_buse(item); 428 | 429 | return snprintf(page, PAGE_SIZE, "%llu\n", buse->io_min); 430 | } 431 | 432 | static ssize_t buse_io_min_store(struct config_item *item, const char *page, size_t count) 433 | { 434 | struct buse *buse = to_buse(item); 435 | int ret; 436 | u64 io_min; 437 | 438 | if (buse->power) 439 | return -EBUSY; 440 | 441 | ret = kstrtou64(page, 0, &io_min); 442 | if (ret) 443 | goto err; 444 | 445 | buse->io_min = io_min; 446 | 447 | return count; 448 | 449 | err: 450 | return ret; 451 | } 452 | 453 | CONFIGFS_ATTR(buse_, io_min); 454 | 455 | static ssize_t buse_io_opt_show(struct config_item *item, char *page) 456 | { 457 | struct buse *buse = to_buse(item); 458 | 459 | return snprintf(page, PAGE_SIZE, "%llu\n", buse->io_opt); 460 | } 461 | 462 | static ssize_t buse_io_opt_store(struct config_item *item, const char *page, size_t count) 463 | { 464 | struct buse *buse = to_buse(item); 465 | int ret; 466 | u64 io_opt; 467 | 468 | if (buse->power) 469 | return -EBUSY; 470 | 471 | ret = kstrtou64(page, 0, &io_opt); 472 | if (ret) 473 | goto err; 474 | 475 | buse->io_opt = io_opt; 476 | 477 | return 
count; 478 | 479 | err: 480 | return ret; 481 | } 482 | 483 | CONFIGFS_ATTR(buse_, io_opt); 484 | 485 | static ssize_t buse_size_show(struct config_item *item, char *page) 486 | { 487 | struct buse *buse = to_buse(item); 488 | 489 | return snprintf(page, PAGE_SIZE, "%llu\n", buse->size); 490 | } 491 | 492 | static ssize_t buse_size_store(struct config_item *item, const char *page, size_t count) 493 | { 494 | struct buse *buse = to_buse(item); 495 | int ret; 496 | u64 size; 497 | 498 | if (buse->power) 499 | return -EBUSY; 500 | 501 | ret = kstrtou64(page, 0, &size); 502 | if (ret) 503 | goto err; 504 | 505 | buse->size = size; 506 | 507 | return count; 508 | 509 | err: 510 | return ret; 511 | } 512 | 513 | CONFIGFS_ATTR(buse_, size); 514 | 515 | static ssize_t buse_collision_area_size_show(struct config_item *item, char *page) 516 | { 517 | struct buse *buse = to_buse(item); 518 | 519 | return snprintf(page, PAGE_SIZE, "%llu\n", buse->collision_area_size); 520 | } 521 | 522 | static ssize_t buse_collision_area_size_store(struct config_item *item, const char *page, size_t count) 523 | { 524 | struct buse *buse = to_buse(item); 525 | int ret; 526 | u64 collision_area_size; 527 | 528 | if (buse->power) 529 | return -EBUSY; 530 | 531 | ret = kstrtou64(page, 0, &collision_area_size); 532 | if (ret) 533 | goto err; 534 | 535 | if (collision_area_size % buse->block_size != 0 || 536 | collision_area_size > buse->size) 537 | collision_area_size = buse->block_size; 538 | 539 | buse->collision_area_size = collision_area_size; 540 | 541 | return count; 542 | 543 | err: 544 | return ret; 545 | } 546 | 547 | CONFIGFS_ATTR(buse_, collision_area_size); 548 | 549 | static struct configfs_attribute *buse_attrs[] = { 550 | &buse_attr_collision_area_size, 551 | &buse_attr_size, 552 | &buse_attr_blocksize, 553 | &buse_attr_io_min, 554 | &buse_attr_io_opt, 555 | &buse_attr_write_chunk_size, 556 | &buse_attr_write_shm_size, 557 | &buse_attr_read_shm_size, 558 | &buse_attr_hw_queues, 559 | &buse_attr_queue_depth, 560 | &buse_attr_no_scheduler, 561 | &buse_attr_can_secure_erase, 562 | &buse_attr_can_write_same, 563 | &buse_attr_can_write_zeroes, 564 | &buse_attr_can_discard, 565 | &buse_attr_power, 566 | NULL, 567 | }; 568 | 569 | static void buse_release(struct config_item *item) 570 | { 571 | struct buse *buse = to_buse(item); 572 | 573 | if (buse->power) 574 | return; 575 | } 576 | 577 | static struct configfs_item_operations buse_ops = { 578 | .release = buse_release, 579 | }; 580 | 581 | static const struct config_item_type buse_type = { 582 | .ct_item_ops = &buse_ops, 583 | .ct_attrs = buse_attrs, 584 | .ct_owner = THIS_MODULE, 585 | }; 586 | 587 | static struct config_item *buse_group_make_item(struct config_group *group, const char *name) 588 | { 589 | struct buse *buse; 590 | uint index; 591 | int ret; 592 | 593 | ret = kstrtouint(name, 0, &index); 594 | if (ret < 0) 595 | return ERR_PTR(ret); 596 | 597 | buse = buse_add(index); 598 | if (IS_ERR(buse)) 599 | return ERR_PTR(-ENOMEM); 600 | 601 | config_item_init_type_name(&buse->item, name, &buse_type); 602 | 603 | return &buse->item; 604 | } 605 | 606 | static void buse_group_drop_item(struct config_group *group, struct config_item *item) 607 | { 608 | struct buse *buse = to_buse(item); 609 | 610 | mutex_lock(&buse->configfs_mutex); 611 | 612 | if (buse->power) 613 | goto err; 614 | 615 | buse_off(buse); 616 | buse_del(buse); 617 | config_item_put(item); 618 | 619 | err: 620 | mutex_unlock(&buse->configfs_mutex); 621 | } 622 | 623 | static struct 
configfs_group_operations buse_group_ops = { 624 | .make_item = buse_group_make_item, 625 | .drop_item = buse_group_drop_item, 626 | }; 627 | 628 | static const struct config_item_type buse_group_type = { 629 | .ct_group_ops = &buse_group_ops, 630 | .ct_owner = THIS_MODULE, 631 | }; 632 | 633 | static struct configfs_subsystem buse_subsys = { 634 | .su_group = { 635 | .cg_item = { 636 | .ci_namebuf = "buse", 637 | .ci_type = &buse_group_type, 638 | }, 639 | }, 640 | }; 641 | 642 | /* 643 | * Initialize configfs subsystem. Later on it is used for all the operation with the kernel module. 644 | */ 645 | int buse_configfs_init(void) 646 | { 647 | int ret; 648 | 649 | config_group_init(&buse_subsys.su_group); 650 | mutex_init(&buse_subsys.su_mutex); 651 | ret = configfs_register_subsystem(&buse_subsys); 652 | if (ret) 653 | goto err; 654 | 655 | return 0; 656 | 657 | err: 658 | return ret; 659 | } 660 | 661 | /* 662 | * Deinit of configfs. 663 | */ 664 | void buse_configfs_exit(void) 665 | { 666 | configfs_unregister_subsystem(&buse_subsys); 667 | } 668 | -------------------------------------------------------------------------------- /kernel/buse-configfs.h: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2021 Vojtech Aschenbrenner */ 2 | 3 | #ifndef BUSE_CONFIGFS_H 4 | #define BUSE_CONFIGFS_H 5 | 6 | /* 7 | * Initialize configfs subsystem. Later on it is used for all the operation with the kernel module. 8 | */ 9 | int buse_configfs_init(void); 10 | 11 | /* 12 | * Deinit of configfs. 13 | */ 14 | void buse_configfs_exit(void); 15 | 16 | #endif 17 | -------------------------------------------------------------------------------- /kernel/buse-rqueue.c: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2021 Vojtech Aschenbrenner */ 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #include "buse-blkdev.h" 15 | #include "buse-chrdev.h" 16 | #include "buse-rqueue.h" 17 | #include "buse-wqueue.h" 18 | #include "main.h" 19 | 20 | /* 21 | * Copy data from the shared memory to the memory specified by the io request. 22 | */ 23 | static void copy_to_request(struct request *rq, char *src) 24 | { 25 | char *dst; 26 | size_t len; 27 | struct bio_vec bvec; 28 | struct req_iterator iter; 29 | 30 | rq_for_each_segment(bvec, rq, iter) { 31 | len = bvec.bv_len; 32 | dst = kmap_atomic(bvec.bv_page); 33 | memcpy(dst + bvec.bv_offset, src, len); 34 | kunmap_atomic(dst); 35 | src += len; 36 | } 37 | } 38 | 39 | /* 40 | * Acknowledge from the userspace that the read is done. If draining is true, it means that we are 41 | * shutting down and we are no longer servig to the userspace daemon. 42 | * 43 | * Data are copied from the shared memory (filled by user space) to the io request destination. Then 44 | * the bitmap tracking the free space in shared memory is updated and read requests is finished. 
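 *
 * The userspace half of this handshake is a plain 8-byte write of the shared
 * memory offset to the read-queue character device (see chrdev_write_rqueue()),
 * roughly the following sketch, where the variable names and the offset
 * bookkeeping are the daemon's own:
 *
 *	uint64_t off = shmem_offset_of_finished_read;
 *	write(rqueue_fd, &off, sizeof(off));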
45 | */ 46 | void ack_read_request(struct buse_rqueue *rq, u64 shmem_offset, bool draining) 47 | { 48 | struct buse *buse = rq->buse; 49 | struct read_chunk *ch; 50 | int shmem_offset_block = shmem_offset / buse->block_size; 51 | int shmem_offset_blocks_cnt = buse->read_shm_size / buse->block_size; 52 | 53 | if (shmem_offset % buse->block_size || 54 | shmem_offset_block >= shmem_offset_blocks_cnt) { 55 | BUG(); 56 | 57 | } 58 | 59 | mutex_lock(&rq->lock); 60 | 61 | ch = rq->chunk_from_bitmap[shmem_offset_block]; 62 | rq->chunk_from_bitmap[shmem_offset_block] = NULL; 63 | 64 | /* TODO: Use blk_mq_complete_request_remote() to finish on local numa node */ 65 | 66 | copy_to_request(ch->cmd->rq, rq->shmem + shmem_offset); 67 | 68 | bitmap_release_region(rq->free_chunks_bitmap, shmem_offset_block, 69 | order_base_2(blk_rq_bytes(ch->cmd->rq) / buse->block_size)); 70 | 71 | list_del_init(&ch->list); 72 | 73 | if (draining) 74 | blk_mq_end_request(ch->cmd->rq, BLK_STS_IOERR); 75 | else 76 | blk_mq_end_request(ch->cmd->rq, BLK_STS_OK); 77 | 78 | kfree(ch); 79 | 80 | wake_up(&rq->free_chunks_avail); 81 | mutex_unlock(&rq->lock); 82 | } 83 | 84 | /* 85 | * If the read chunk is actually a termination chunk leading to device shutdown. 86 | */ 87 | bool is_rqueue_term(struct read_chunk *ch) 88 | { 89 | return ch->shmem_offset == -1; 90 | } 91 | 92 | /* 93 | * Pulls read chunk from the busy queue and returns it. If there is no read chunk in the busy queue, 94 | * we sleep. If the chunks is not a termination chunk, we add to the fetched list meaning that the 95 | * chunk is in userspace but not yet acknowledged. It is for the case of userspace failure and 96 | * potential rerun fetched but not yet acknowledged chunks. 97 | */ 98 | struct read_chunk *pop_read_request_wait(struct buse_rqueue *rq) 99 | { 100 | int ret; 101 | struct read_chunk *ch = NULL; 102 | 103 | ret = wait_event_interruptible(rq->busy_chunks_avail, !list_empty(&rq->busy_chunks)); 104 | if (ret < 0) 105 | return ERR_PTR(-EAGAIN); 106 | 107 | mutex_lock(&rq->lock); 108 | 109 | BUG_ON(list_empty(&rq->busy_chunks)); 110 | 111 | ch = list_first_entry(&rq->busy_chunks, struct read_chunk, list); 112 | list_del_init(&ch->list); 113 | 114 | if (!is_rqueue_term(ch)) 115 | list_add_tail(&ch->list, &rq->fetched_chunks); 116 | mutex_unlock(&rq->lock); 117 | 118 | return ch; 119 | } 120 | 121 | /* 122 | * Allocates space in shared memory for a new read chunk corresponding to the cmd. 
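 *
 * The reservation is made in units of block_size blocks and rounded up to a
 * power of two by order_base_2(). As a sketch with a 4 KiB block size, a
 * 64 KiB read needs 64 KiB / 4 KiB = 16 blocks, i.e. an order-4 region, and
 * the resulting shmem_offset is chunk_index * 4 KiB.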
123 | */ 124 | static struct read_chunk *create_read_chunk(struct buse_cmd *cmd) 125 | { 126 | struct buse_queue *q = cmd->queue; 127 | struct buse_rqueue *rq = &q->r; 128 | struct buse *buse = q->r.buse; 129 | struct request *r = cmd->rq; 130 | size_t len = blk_rq_sectors(r); 131 | size_t sector = blk_rq_pos(r); 132 | int chunk_index; 133 | int ret; 134 | struct read_chunk *ch; 135 | 136 | size_t shmem_blocks = buse->read_shm_size / buse->block_size; 137 | 138 | chunk_index = bitmap_find_free_region(rq->free_chunks_bitmap, shmem_blocks, order_base_2(len * SECTOR_SIZE / buse->block_size)); 139 | if (chunk_index < 0) { 140 | ret = -EFAULT; 141 | goto err; 142 | } 143 | 144 | ch = kmalloc(sizeof(*ch), GFP_KERNEL); 145 | if (!ch) { 146 | ret = -ENOMEM; 147 | goto err_bitmap; 148 | } 149 | 150 | ch->len = len; 151 | ch->sector = sector; 152 | ch->cmd = cmd; 153 | ch->shmem_offset = chunk_index * buse->block_size; 154 | rq->chunk_from_bitmap[chunk_index] = ch; 155 | 156 | return ch; 157 | 158 | err_bitmap: 159 | bitmap_release_region(rq->free_chunks_bitmap, chunk_index, order_base_2(len * SECTOR_SIZE / buse->block_size)); 160 | err: 161 | return ERR_PTR(ret); 162 | } 163 | 164 | /* 165 | * Creates a read chunk and puts it to the busy queue. The chunks is fetched from the busy queue by 166 | * the user space. The busy queue is woken up, in case it slept. 167 | */ 168 | blk_status_t buse_read_plain(struct buse_cmd *cmd) 169 | { 170 | struct buse_queue *q = cmd->queue; 171 | struct buse_rqueue *rq = &q->r; 172 | struct read_chunk *ch; 173 | struct buse *buse = rq->buse; 174 | size_t len = (u64)blk_rq_bytes(cmd->rq) / buse->block_size; 175 | 176 | again: 177 | if (cmd->canceled) { 178 | blk_mq_end_request(cmd->rq, BLK_STS_IOERR); 179 | return BLK_STS_IOERR; 180 | } 181 | 182 | mutex_lock(&rq->lock); 183 | 184 | if (rq->terminated) { 185 | blk_mq_end_request(cmd->rq, BLK_STS_IOERR); 186 | mutex_unlock(&rq->lock); 187 | return BLK_STS_IOERR; 188 | } 189 | 190 | ch = create_read_chunk(cmd); 191 | if (IS_ERR(ch)) { 192 | size_t shmem_blocks = buse->read_shm_size / buse->block_size; 193 | mutex_unlock(&rq->lock); 194 | wait_event_interruptible(rq->free_chunks_avail, 195 | bitmap_find_next_zero_area(rq->free_chunks_bitmap, shmem_blocks, 0, len, 0) 196 | < shmem_blocks - len); 197 | goto again; 198 | } 199 | 200 | list_add_tail(&ch->list, &rq->busy_chunks); 201 | blk_mq_start_request(cmd->rq); 202 | wake_up(&rq->busy_chunks_avail); 203 | mutex_unlock(&rq->lock); 204 | 205 | return BLK_STS_OK; 206 | } 207 | 208 | /* 209 | * Sends termination chunk to the rq. 210 | */ 211 | void rqueue_send_term(struct buse_rqueue *rq) 212 | { 213 | struct read_chunk *ch; 214 | size_t shmem_blocks = rq->buse->read_shm_size / rq->buse->block_size; 215 | again: 216 | mutex_lock(&rq->lock); 217 | 218 | if (!bitmap_empty(rq->free_chunks_bitmap, shmem_blocks)) { 219 | mutex_unlock(&rq->lock); 220 | wait_event_interruptible(rq->free_chunks_avail, bitmap_empty(rq->free_chunks_bitmap, shmem_blocks)); 221 | goto again; 222 | } 223 | 224 | 225 | if (wq_has_sleeper(&rq->free_chunks_avail)) { 226 | wake_up(&rq->free_chunks_avail); 227 | mutex_unlock(&rq->lock); 228 | goto again; 229 | } 230 | 231 | ch = kzalloc(sizeof(*ch), GFP_KERNEL); 232 | if (!ch) { 233 | pr_alert("Cannot allocate termination packet! 
Check traffic and shut down manually!\n"); 234 | return; 235 | } 236 | 237 | ch->shmem_offset = (u64)-1; 238 | list_add_tail(&ch->list, &rq->busy_chunks); 239 | rq->terminated = true; 240 | wake_up(&rq->busy_chunks_avail); 241 | 242 | mutex_unlock(&rq->lock); 243 | } 244 | 245 | static bool overlaps(size_t x, size_t x_len, size_t y, size_t y_len) 246 | { 247 | return ((x <= y && x + x_len > y) || (x >= y && x < y + y_len)); 248 | } 249 | 250 | /* 251 | * Adds a dependent read to the write chunk. When that write chunk is acknowledged, all dependent 252 | * reads are allowed to be send to userspace. 253 | */ 254 | static int read_dep_add(struct buse_cmd *cmd, struct write_chunk *ch) 255 | { 256 | struct rq_node *node = kzalloc(sizeof(*node), GFP_KERNEL); 257 | if (!node) 258 | return -ENOMEM; 259 | 260 | node->rq = cmd->rq; 261 | atomic_inc(&cmd->read.write_deps); 262 | list_add_tail(&node->list, &ch->dependent_reads); 263 | 264 | return 0; 265 | } 266 | 267 | /* 268 | * Checks if the command has conflict with any of writes in the write chunk. Conflict means that 269 | * read reads data written be the write. This is read after write hazard. 270 | */ 271 | static bool is_read_dep_conflict(struct buse_cmd *cmd, struct write_chunk *ch) 272 | { 273 | size_t sector = blk_rq_pos(cmd->rq); 274 | size_t len = blk_rq_sectors(cmd->rq); 275 | int i; 276 | struct writelist_item *w; 277 | 278 | if (!ch || is_flush_packet(ch)) 279 | return false; 280 | 281 | w = ch->writelist_frontier - ch->num_writes; 282 | for (i = 0; i < ch->num_writes; i++, w++) 283 | if (overlaps(sector, len, w->sector, w->len)) 284 | return true; 285 | 286 | return false; 287 | } 288 | 289 | /* 290 | * First checks for read after write hazards and add potential conflicting reads to the appropriate 291 | * write chunks. If no conflict was found and there is no more queues to check the read processed. 292 | * Otherwise the read is processed as a callback when depending write chunk is acknowledged. 293 | */ 294 | static int rqueue_read_checked(void *data) 295 | { 296 | int ret; 297 | struct cmd_q_args *args = data; 298 | struct buse_cmd *cmd = args->cmd; 299 | struct buse_queue *q = args->q; 300 | struct buse_wqueue *wq = &q->w; 301 | struct write_chunk *ch; 302 | 303 | mutex_lock(&wq->lock); 304 | list_for_each_entry(ch, &wq->busy_chunks, list) 305 | if (is_read_dep_conflict(cmd, ch)) { 306 | ret = read_dep_add(cmd, ch); 307 | if (ret) { 308 | pr_alert("Cannot add read dep from busy_chunks\n"); 309 | goto err; 310 | } 311 | } 312 | 313 | list_for_each_entry(ch, &wq->fetched_chunks, list) 314 | if (is_read_dep_conflict(cmd, ch)) { 315 | ret = read_dep_add(cmd, ch); 316 | if (ret) { 317 | pr_alert("Cannot add read dep from fetched_chunks\n"); 318 | goto err; 319 | } 320 | } 321 | 322 | if (is_read_dep_conflict(cmd, wq->active_chunk)) { 323 | ret = read_dep_add(cmd, wq->active_chunk); 324 | if (ret) { 325 | pr_alert("Cannot add read dep from active_chunk\n"); 326 | goto err; 327 | } 328 | close_chunk(wq); 329 | } 330 | 331 | goto ret; 332 | 333 | err: 334 | cmd->canceled = true; 335 | ret: 336 | mutex_unlock(&wq->lock); 337 | if (atomic_dec_and_test(&args->cmd->read.queues_pending) && 338 | atomic_read(&args->cmd->read.write_deps) == 0 && 339 | atomic_cmpxchg(&args->cmd->read.queues_pending, 0, 1) == 0) { 340 | buse_read_plain(args->cmd); 341 | } 342 | 343 | kfree(data); 344 | 345 | /* Here it depends on whether sequential or threaded version is used. 
*/ 346 | return 0; /* For sequential version */ 347 | /* do_exit(0); */ /* For threaded version */ 348 | } 349 | 350 | /* 351 | * Spawns checked reads on all queues. 352 | */ 353 | blk_status_t buse_read(struct buse_cmd *cmd) 354 | { 355 | int i; 356 | struct cmd_q_args *args; 357 | struct buse_queue *q = cmd->queue; 358 | struct buse *buse = q->r.buse; 359 | size_t num_queues = buse->num_queues; 360 | 361 | atomic_set(&cmd->read.write_deps, 0); 362 | atomic_set(&cmd->read.queues_pending, num_queues); 363 | 364 | for (i = 0; i < num_queues; i++) { 365 | args = kzalloc(sizeof(*args), GFP_KERNEL); 366 | if (!args) 367 | goto err; 368 | 369 | args->cmd = cmd; 370 | args->q = &buse->queues[i]; 371 | 372 | 373 | /* 374 | * Asynchronous version 375 | * if (kthread_run(rqueue_read_checked, args, "buse-queue_read_checked_th%d", i) < 0) { 376 | * pr_alert("Cannot spawn rqueue_read_checked thread!\n"); 377 | * goto err_args; 378 | * } 379 | */ 380 | 381 | rqueue_read_checked(args); /* Sequential version */ 382 | } 383 | 384 | return BLK_STS_OK; 385 | 386 | /* err_args: */ 387 | kfree(args); 388 | err: 389 | atomic_sub(num_queues - i, &cmd->read.queues_pending); 390 | cmd->canceled = true; 391 | 392 | if (!i) 393 | return BLK_STS_AGAIN; 394 | 395 | return BLK_STS_OK; 396 | } 397 | 398 | /* 399 | * Drains all the queues because the is shutting down non-gracefully and we don't want memory leaks. 400 | */ 401 | static void rqueue_drain(struct buse_rqueue *rq) 402 | { 403 | struct read_chunk *chunk; 404 | uint r_chunks = rq->buse->read_shm_size / rq->buse->block_size; 405 | int i; 406 | 407 | for (i = 0; i < r_chunks; i++) { 408 | size_t offset = i * rq->buse->block_size; 409 | if (rq->chunk_from_bitmap[i]) 410 | ack_read_request(rq, offset, true); 411 | } 412 | 413 | while (!list_empty(&rq->busy_chunks)) { 414 | chunk = list_first_entry(&rq->busy_chunks, struct read_chunk, list); 415 | mutex_unlock(&rq->lock); 416 | if (is_rqueue_term(chunk)) { 417 | mutex_lock(&rq->lock); 418 | list_del_init(&chunk->list); 419 | mutex_unlock(&rq->lock); 420 | kfree(chunk); 421 | } else 422 | ack_read_request(rq, chunk->shmem_offset, true); 423 | mutex_lock(&rq->lock); 424 | } 425 | } 426 | 427 | /* 428 | * Deallocates the read queue. 429 | */ 430 | static void rqueue_exit(struct buse_rqueue *rq) 431 | { 432 | rqueue_drain(rq); 433 | kfree(rq->chunk_from_bitmap); 434 | bitmap_free(rq->free_chunks_bitmap); 435 | vfree(rq->shmem); 436 | } 437 | 438 | /* 439 | * Allocates the read queue. 
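 *
 * All per-queue buffers are allocated on the NUMA node that services the queue
 * (see buse_get_numa_node_for_queue_id()). The free-space bitmap holds one bit
 * per block of shared memory; as a sketch, read_shm_size = 4 MiB with a 4 KiB
 * block size gives 1024 bits, i.e. 16 unsigned longs on a 64-bit kernel.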
440 | */ 441 | static int rqueue_init(struct buse_rqueue *rq) 442 | { 443 | int ret; 444 | struct buse *buse = rq->buse; 445 | uint r_chunks = buse->read_shm_size / buse->block_size; 446 | int numa_node = buse_get_numa_node_for_queue_id(rq->buse, rq->q->id); 447 | 448 | init_waitqueue_head(&rq->busy_chunks_avail); 449 | init_waitqueue_head(&rq->free_chunks_avail); 450 | INIT_LIST_HEAD(&rq->busy_chunks); 451 | INIT_LIST_HEAD(&rq->fetched_chunks); 452 | mutex_init(&rq->lock); 453 | 454 | rq->size = buse->read_shm_size; 455 | 456 | rq->shmem = vmalloc_node(rq->size, numa_node); 457 | if (!rq->shmem) { 458 | ret = -ENOMEM; 459 | goto err; 460 | } 461 | 462 | rq->free_chunks_bitmap = kmalloc_array_node(BITS_TO_LONGS(r_chunks), sizeof(unsigned long), GFP_KERNEL | __GFP_ZERO, numa_node); 463 | if (!rq->free_chunks_bitmap) { 464 | ret = -ENOMEM; 465 | goto err_shmem; 466 | } 467 | 468 | rq->chunk_from_bitmap = kcalloc_node(r_chunks, sizeof(struct read_chunk *), GFP_KERNEL, numa_node); 469 | if (!rq->chunk_from_bitmap) { 470 | ret = -ENOMEM; 471 | goto err_bitmap; 472 | } 473 | 474 | return 0; 475 | 476 | err_bitmap: 477 | bitmap_free(rq->free_chunks_bitmap); 478 | err_shmem: 479 | vfree(rq->shmem); 480 | err: 481 | return ret; 482 | } 483 | 484 | /* 485 | * Init all read queues. 486 | */ 487 | int buse_rqueues_init(struct buse *buse) 488 | { 489 | int ret, i; 490 | struct buse_queue *q; 491 | 492 | for (i = 0, q = buse->queues; i < buse->num_queues; i++, q++) { 493 | q->r.buse = buse; 494 | q->r.q = q; 495 | ret = rqueue_init(&q->r); 496 | if (ret) { 497 | i++; 498 | q++; 499 | goto err; 500 | } 501 | } 502 | 503 | return 0; 504 | 505 | err: 506 | for (i--, q--; i > 0; i--, q--) 507 | rqueue_exit(&q->r); 508 | 509 | return ret; 510 | } 511 | 512 | /* 513 | * Deinit all read queues. 514 | */ 515 | int buse_rqueues_exit(struct buse *buse) 516 | { 517 | int i; 518 | struct buse_queue *q; 519 | 520 | for (i = 0, q = buse->queues; i < buse->num_queues; i++, q++) 521 | rqueue_exit(&q->r); 522 | 523 | return 0; 524 | } 525 | 526 | /* 527 | * Rerun all fetched chunks by the user space again. This is called when user space failes without 528 | * acknowledging read chunks and reconnects again. 529 | */ 530 | static void rerun_read_chunks(struct buse_rqueue *rq) 531 | { 532 | struct read_chunk *ch; 533 | 534 | mutex_lock(&rq->lock); 535 | while (!list_empty(&rq->fetched_chunks)) { 536 | ch = list_last_entry(&rq->fetched_chunks, struct read_chunk, list); 537 | list_del_init(&ch->list); 538 | list_add(&ch->list, &rq->busy_chunks); 539 | } 540 | wake_up(&rq->busy_chunks_avail); 541 | mutex_unlock(&rq->lock); 542 | } 543 | 544 | /* 545 | * Set the queue to be bound. 546 | */ 547 | void buse_rqueue_bind(struct buse_rqueue *rq) 548 | { 549 | mutex_lock(&rq->buse->configfs_mutex); 550 | atomic_set(&rq->bound, 1); 551 | mutex_unlock(&rq->buse->configfs_mutex); 552 | buse_blkdev_init_cond(rq->buse); 553 | rerun_read_chunks(rq); 554 | } 555 | 556 | /* 557 | * Returns true if all read queues are bound. I.e. have connected the userspace counterpart. 
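 *
 * The bound flags are flipped in the character-device open() and release()
 * callbacks, so a crashed daemon automatically drops back to unbound; on
 * reconnect buse_rqueue_bind() re-queues any chunks that were fetched but never
 * acknowledged (see rerun_read_chunks() above).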
558 | */
559 | bool buse_rqueues_bound(struct buse *buse)
560 | {
561 | 	int i;
562 | 	struct buse_rqueue *rq;
563 | 
564 | 	for (i = 0; i < buse->num_queues; i++) {
565 | 		rq = &buse->queues[i].r;
566 | 		if (atomic_read(&rq->bound) == 0)
567 | 			return false;
568 | 	}
569 | 
570 | 	return true;
571 | }
572 | 
--------------------------------------------------------------------------------
/kernel/buse-rqueue.h:
--------------------------------------------------------------------------------
1 | /* Copyright (C) 2021 Vojtech Aschenbrenner */
2 | 
3 | #ifndef BUSE_RQUEUE_H
4 | #define BUSE_RQUEUE_H
5 | 
6 | #include 
7 | #include "main.h"
8 | 
9 | /*
10 | * Spawns checked reads on all queues.
11 | */
12 | blk_status_t buse_read(struct buse_cmd *cmd);
13 | 
14 | /*
15 | * Creates a read chunk and puts it into the busy queue. The chunk is fetched from the busy queue by
16 | * the user space. The busy queue is woken up in case it slept.
17 | */
18 | blk_status_t buse_read_plain(struct buse_cmd *cmd);
19 | 
20 | /*
21 | * Init all read queues.
22 | */
23 | int buse_rqueues_init(struct buse *buse);
24 | 
25 | /*
26 | * Deinit all read queues.
27 | */
28 | int buse_rqueues_exit(struct buse *buse);
29 | 
30 | /*
31 | * Pulls a read chunk from the busy queue and returns it. If there is no read chunk in the busy queue,
32 | * we sleep. If the chunk is not a termination chunk, we add it to the fetched list, meaning that the
33 | * chunk is in userspace but not yet acknowledged. This covers the case of a userspace failure, where
34 | * fetched but not yet acknowledged chunks may have to be rerun.
35 | */
36 | struct read_chunk *pop_read_request_wait(struct buse_rqueue *rq);
37 | 
38 | /*
39 | * Acknowledgment from the userspace that the read is done. If draining is true, it means that we are
40 | * shutting down and are no longer serving the userspace daemon.
41 | *
42 | * Data are copied from the shared memory (filled by user space) to the io request destination. Then
43 | * the bitmap tracking the free space in shared memory is updated and the read request is finished.
44 | */
45 | void ack_read_request(struct buse_rqueue *rqueue, u64 shmem_offset, bool draining);
46 | 
47 | /*
48 | * Returns true if all read queues are bound. I.e. have connected the userspace counterpart.
49 | */
50 | bool buse_rqueues_bound(struct buse *buse);
51 | 
52 | /*
53 | * Set the queue to be bound.
54 | */
55 | void buse_rqueue_bind(struct buse_rqueue *rq);
56 | 
57 | /*
58 | * Sends termination chunk to the rq.
59 | */
60 | void rqueue_send_term(struct buse_rqueue *rq);
61 | 
62 | 
63 | /*
64 | * True if the read chunk is actually a termination chunk leading to device shutdown.
65 | */
66 | bool is_rqueue_term(struct read_chunk *ch);
67 | 
68 | #endif
69 | 
--------------------------------------------------------------------------------
/kernel/buse-wqueue.c:
--------------------------------------------------------------------------------
1 | /* Copyright (C) 2021 Vojtech Aschenbrenner */
2 | 
3 | #include 
4 | #include 
5 | #include 
6 | #include 
7 | #include 
8 | #include 
9 | #include 
10 | #include 
11 | 
12 | #include "buse-blkdev.h"
13 | #include "buse-chrdev.h"
14 | #include "buse-rqueue.h"
15 | #include "buse-wqueue.h"
16 | #include "main.h"
17 | 
18 | static bool valid_buse_cmd(struct buse_cmd *cmd)
19 | {
20 | 	return cmd->magic == BUSE_MAGIC;
21 | }
22 | 
23 | /*
24 | * Finalizer for a flush chunk when it is acknowledged by the user space.
25 | */ 26 | static void flush_finalize(struct write_chunk *ch, struct buse_wqueue *wq, bool draining) 27 | { 28 | struct buse_cmd *cmd = ch->cmd; 29 | 30 | if (!valid_buse_cmd(cmd)) { 31 | pr_debug("Invalid flush cmd!\n"); 32 | return; 33 | } 34 | 35 | mutex_lock(&wq->lock); 36 | list_del_init(&ch->list); 37 | mutex_unlock(&wq->lock); 38 | 39 | kfree(ch); 40 | 41 | if (atomic_dec_and_test(&cmd->flush.queues_pending)) { 42 | if (draining) 43 | blk_mq_end_request(cmd->rq, 44 | cmd->canceled ? BLK_STS_IOERR : BLK_STS_OK); 45 | else 46 | blk_mq_end_request(cmd->rq, 47 | cmd->canceled ? BLK_STS_AGAIN : BLK_STS_OK); 48 | } 49 | } 50 | 51 | static size_t chunk_index(struct buse_wqueue *wq, struct write_chunk *wc) 52 | { 53 | void *chunks_ = wq->chunks; 54 | void *wc_ = wc; 55 | 56 | return (wc_ - chunks_) / sizeof(*wc); 57 | } 58 | 59 | /* 60 | * Initialize write chunk structure. 61 | */ 62 | static void init_write_chunk(struct buse_wqueue *wq, struct write_chunk *ch) 63 | { 64 | size_t max_writes = wq->buse->write_chunk_size / wq->buse->block_size; 65 | u64 i = chunk_index(wq, ch); 66 | 67 | ch->shmem_offset = i * wq->buse->write_chunk_size; 68 | ch->writelist_frontier = wq->shmem + ch->shmem_offset; 69 | ch->data_frontier = ch->writelist_frontier + max_writes; 70 | ch->num_writes = 0; 71 | 72 | INIT_LIST_HEAD(&ch->dependent_reads); 73 | 74 | mutex_lock(&wq->lock); 75 | list_add_tail(&ch->list, &wq->free_chunks); 76 | mutex_unlock(&wq->lock); 77 | } 78 | 79 | /* 80 | * Finalizer for write chunk. It initiates read on all dependent reads from the read after write 81 | * hazard check. Then it just recycle the write chunk for future usage. 82 | */ 83 | static int write_finalize(struct write_chunk *ch, struct buse_wqueue *wq) 84 | { 85 | struct rq_node *rq; 86 | struct buse_cmd *cmd; 87 | 88 | mutex_lock(&wq->lock); 89 | 90 | /* Remove from fetched list */ 91 | list_del_init(&ch->list); 92 | 93 | mutex_unlock(&wq->lock); 94 | 95 | while (!list_empty(&ch->dependent_reads)) { 96 | rq = list_first_entry(&ch->dependent_reads, struct rq_node, list); 97 | cmd = blk_mq_rq_to_pdu(rq->rq); 98 | if (atomic_dec_and_test(&cmd->read.write_deps) && 99 | atomic_read(&cmd->read.queues_pending) == 0 && 100 | atomic_cmpxchg(&cmd->read.queues_pending, 0, 1) == 0) { 101 | 102 | buse_read_plain(cmd); 103 | } 104 | 105 | list_del_init(&rq->list); 106 | kfree(rq); 107 | } 108 | 109 | init_write_chunk(wq, ch); 110 | wake_up(&wq->free_chunks_avail); 111 | 112 | return 0; 113 | } 114 | 115 | static bool is_flush_offset(u64 offset) 116 | { 117 | return offset > (1UL << 32); 118 | } 119 | 120 | bool is_flush_packet(struct write_chunk *wc) 121 | { 122 | return is_flush_offset(wc->shmem_offset); 123 | } 124 | 125 | /* 126 | * When userspace acknowledge the write chunk we perform appropriate actions based on the write 127 | * chunk type. 128 | */ 129 | void ack_write_request(struct buse_wqueue *wq, u64 chunk_offset, bool draining) 130 | { 131 | if (is_flush_offset(chunk_offset)) 132 | flush_finalize((struct write_chunk *)chunk_offset, wq, draining); 133 | else { 134 | struct write_chunk *ch; 135 | u64 chunk_index = chunk_offset / wq->buse->write_chunk_size; 136 | uint chunks_total = wq->buse->write_shm_size / wq->buse->write_chunk_size; 137 | 138 | if (chunk_offset % wq->buse->write_chunk_size || 139 | chunk_index >= chunks_total) { 140 | BUG(); 141 | } 142 | 143 | ch = &wq->chunks[chunk_index]; 144 | write_finalize(ch, wq); 145 | } 146 | } 147 | 148 | /* 149 | * Pulls write chunk from the busy queue and returns it. 
If there is no write chunk in the busy queue, 150 | * we sleep. If the chunks is not a termination chunk, we add to the fetched list meaning that the 151 | * chunk is in userspace but not yet acknowledged. It is for the case of userspace failure and 152 | * potential rerun fetched but not yet acknowledged chunks. 153 | */ 154 | struct write_chunk *pop_write_request_wait(struct buse_wqueue *wq) 155 | { 156 | struct write_chunk *ch = NULL; 157 | int ret; 158 | 159 | ret = wait_event_interruptible(wq->busy_chunks_avail, !list_empty(&wq->busy_chunks)); 160 | if (ret < 0) 161 | return ERR_PTR(-EAGAIN); 162 | 163 | mutex_lock(&wq->lock); 164 | 165 | BUG_ON(list_empty(&wq->busy_chunks)); 166 | 167 | ch = list_first_entry(&wq->busy_chunks, struct write_chunk, list); 168 | list_del_init(&ch->list); 169 | 170 | if (!is_wqueue_term(ch)) 171 | list_add_tail(&ch->list, &wq->fetched_chunks); 172 | 173 | mutex_unlock(&wq->lock); 174 | 175 | return ch; 176 | } 177 | 178 | /* 179 | * Closes active chunk of the queue, i.e. no more writes can be written to the chunk and a new 180 | * chunks has to be opened. This usually means that flush happened or the chunk is full. 181 | */ 182 | int close_chunk(struct buse_wqueue *wq) 183 | { 184 | struct write_chunk *ch = wq->active_chunk; 185 | 186 | if (!ch || !ch->num_writes) 187 | goto end; 188 | 189 | list_add_tail(&ch->list, &wq->busy_chunks); 190 | wq->active_chunk = NULL; 191 | 192 | wake_up(&wq->busy_chunks_avail); 193 | 194 | end: 195 | return 0; 196 | } 197 | 198 | /* 199 | * Opens new active chunk if there is any free chunk. 200 | */ 201 | int open_chunk(struct buse_wqueue *wq) 202 | { 203 | BUG_ON(wq->active_chunk); 204 | 205 | if (list_empty(&wq->free_chunks)) 206 | return -EFAULT; 207 | 208 | wq->active_chunk = list_first_entry(&wq->free_chunks, struct write_chunk, list); 209 | list_del_init(&wq->active_chunk->list); 210 | 211 | return 0; 212 | } 213 | 214 | /* 215 | * Returns amount of free bytes in the chunk. 216 | */ 217 | static size_t chunk_free_bytes(struct buse_wqueue *wq, struct write_chunk *ch) 218 | { 219 | void *end = wq->shmem + ch->shmem_offset + wq->buse->write_chunk_size; 220 | return end - ch->data_frontier; 221 | } 222 | 223 | /* 224 | * Splits long writes to multiple writes not crossing the collision areas boundary and adds the 225 | * sequential number to each write. 
226 | */ 227 | static void divide_add_collision(struct buse_cmd *cmd, struct write_chunk *ch) 228 | { 229 | struct buse* buse = cmd->queue->w.buse; 230 | size_t offset = blk_rq_pos(cmd->rq) * SECTOR_SIZE; 231 | s64 size = blk_rq_bytes(cmd->rq); 232 | size_t col_size = buse->collision_area_size; 233 | struct writelist_item write; 234 | size_t flag = req_op(cmd->rq); 235 | 236 | size_t new_size = round_up(offset+1, col_size) - offset; 237 | size_t col_id = offset / col_size; 238 | u64 id = atomic_add_return(1, &buse->collision_counters[col_id]); 239 | if (new_size > size) 240 | new_size = size; 241 | 242 | 243 | write.sector = offset / SECTOR_SIZE; 244 | write.len = new_size / SECTOR_SIZE; 245 | write.id = id; 246 | write.flag = flag; 247 | memcpy(ch->writelist_frontier, &write, sizeof(write)); 248 | ch->writelist_frontier++; 249 | ch->num_writes++; 250 | 251 | offset += new_size; 252 | size -= new_size; 253 | 254 | for (; size > 0; size -= col_size, offset += col_size) { 255 | size_t col_id = offset / col_size; 256 | u64 id = atomic_add_return(1, &buse->collision_counters[col_id]); 257 | write.sector = offset / SECTOR_SIZE; 258 | write.len = col_size / SECTOR_SIZE; 259 | if (size < col_size) 260 | write.len = size / SECTOR_SIZE; 261 | write.id = id; 262 | write.flag = flag; 263 | memcpy(ch->writelist_frontier, &write, sizeof(write)); 264 | ch->writelist_frontier++; 265 | ch->num_writes++; 266 | } 267 | } 268 | 269 | /* 270 | * Copy data to the shared memory from the memory specified by the io request. 271 | */ 272 | static void copy_to_chunk(struct buse_cmd *cmd, struct write_chunk *ch) 273 | { 274 | char *src; 275 | size_t len; 276 | struct bio_vec bvec; 277 | struct req_iterator iter; 278 | struct request *rq = cmd->rq; 279 | 280 | divide_add_collision(cmd, ch); 281 | 282 | if (req_op(rq) == REQ_OP_WRITE) { 283 | rq_for_each_segment(bvec, rq, iter) { 284 | len = bvec.bv_len; 285 | src = kmap_atomic(bvec.bv_page); 286 | memcpy(ch->data_frontier, src + bvec.bv_offset, len); 287 | kunmap_atomic(src); 288 | ch->data_frontier += len; 289 | } 290 | } 291 | } 292 | 293 | /* 294 | * Compute number of needed slots in the metadata area of the write chunk since the write can be 295 | * split into multiple writes. 296 | */ 297 | static size_t needed_slots(struct buse_cmd *cmd) 298 | { 299 | size_t size = blk_rq_bytes(cmd->rq); 300 | struct buse *buse = cmd->queue->w.buse; 301 | 302 | /* Upper bound of the crossing areas. */ 303 | return size / buse->collision_area_size + 2; 304 | } 305 | 306 | /* 307 | * Number of free write metadata slots in the chunk. 308 | */ 309 | static size_t chunk_free_slots(struct buse_wqueue * wq, struct write_chunk *ch) 310 | { 311 | size_t max_writes = wq->buse->write_chunk_size / wq->buse->block_size; 312 | return max_writes - ch->num_writes; 313 | } 314 | 315 | /* 316 | * True if the chunk is termination chunk. 317 | */ 318 | bool is_wqueue_term(struct write_chunk *ch) 319 | { 320 | return ch->shmem_offset == -1; 321 | } 322 | 323 | /* 324 | * Sends termination chunk to the write queue. 
325 | */ 326 | void wqueue_send_term(struct buse_wqueue *wq) 327 | { 328 | struct write_chunk *fake_chunk; 329 | 330 | again: 331 | mutex_lock(&wq->lock); 332 | 333 | if (!list_empty(&wq->busy_chunks)) { 334 | mutex_unlock(&wq->lock); 335 | wait_event_interruptible(wq->free_chunks_avail, list_empty(&wq->busy_chunks)); 336 | goto again; 337 | } 338 | 339 | if (wq_has_sleeper(&wq->free_chunks_avail)) { 340 | wake_up(&wq->free_chunks_avail); 341 | mutex_unlock(&wq->lock); 342 | goto again; 343 | } 344 | 345 | fake_chunk = kzalloc(sizeof(*fake_chunk), GFP_KERNEL); 346 | if (!fake_chunk) { 347 | pr_debug("Cannot allocate for term uspace_packet!\n"); 348 | return; 349 | } 350 | 351 | fake_chunk->shmem_offset = (u64)-1; 352 | 353 | close_chunk(wq); 354 | list_add_tail(&fake_chunk->list, &wq->busy_chunks); 355 | wq->terminated = true; 356 | wake_up(&wq->busy_chunks_avail); 357 | 358 | mutex_unlock(&wq->lock); 359 | } 360 | 361 | /* 362 | * Copies data to the active chunk and immediately acknowledge the write request. 363 | */ 364 | blk_status_t buse_write(struct buse_cmd *cmd) 365 | { 366 | struct buse_queue *q = cmd->queue; 367 | struct buse_wqueue *wq = &q->w; 368 | struct request *rq = cmd->rq; 369 | size_t max_writes = wq->buse->write_chunk_size / wq->buse->block_size; 370 | 371 | if (req_op(rq) == REQ_OP_WRITE) 372 | BUG_ON(blk_rq_bytes(rq) > wq->buse->write_chunk_size - max_writes * sizeof(struct writelist_item)); 373 | 374 | again: 375 | if (cmd->canceled) 376 | return BLK_STS_IOERR; 377 | 378 | mutex_lock(&wq->lock); 379 | 380 | if (wq->terminated) { 381 | mutex_unlock(&wq->lock); 382 | return BLK_STS_IOERR; 383 | } 384 | 385 | if (wq->active_chunk && 386 | (chunk_free_bytes(wq, wq->active_chunk) < blk_rq_bytes(rq) || 387 | chunk_free_slots(wq, wq->active_chunk) < needed_slots(cmd))) 388 | close_chunk(wq); 389 | 390 | if (!wq->active_chunk && open_chunk(wq) < 0) { 391 | mutex_unlock(&wq->lock); 392 | wait_event_interruptible(wq->free_chunks_avail, !list_empty(&wq->free_chunks)); 393 | goto again; 394 | } 395 | 396 | blk_mq_start_request(rq); 397 | BUG_ON(wq->active_chunk->num_writes > max_writes); 398 | copy_to_chunk(cmd, wq->active_chunk); 399 | mutex_unlock(&wq->lock); 400 | 401 | blk_mq_end_request(rq, BLK_STS_OK); 402 | 403 | return BLK_STS_OK; 404 | } 405 | 406 | /* 407 | * Send flush chunk to the queue. 408 | */ 409 | static int send_flush(struct buse_wqueue* wq, struct buse_cmd *cmd) 410 | { 411 | struct write_chunk *fake_chunk = kmalloc(sizeof(*fake_chunk), GFP_KERNEL); 412 | if (!fake_chunk) { 413 | pr_debug("Cannot allocate for flush uspace_packet!\n"); 414 | return -1; 415 | } 416 | 417 | fake_chunk->shmem_offset = (u64)fake_chunk; 418 | fake_chunk->num_writes = (u64)fake_chunk; 419 | fake_chunk->cmd = cmd; 420 | 421 | list_add_tail(&fake_chunk->list, &wq->busy_chunks); 422 | wake_up(&wq->busy_chunks_avail); 423 | 424 | return 0; 425 | } 426 | 427 | /* 428 | * Per queue flush operation. Closes active chunk and immediately after it sends the flush chunk. 
429 | */ 430 | static int wqueue_flush(void *data) 431 | { 432 | struct cmd_q_args *args = data; 433 | struct buse_wqueue *wq = &args->q->w; 434 | struct buse_cmd *cmd = args->cmd; 435 | 436 | mutex_lock(&wq->lock); 437 | close_chunk(wq); 438 | if (send_flush(wq, args->cmd) == -1) { 439 | pr_debug("Cannot send flush packet from flusher!\n"); 440 | cmd->canceled = true; 441 | if (atomic_dec_and_test(&cmd->flush.queues_pending)) { 442 | blk_mq_start_request(cmd->rq); 443 | blk_mq_end_request(cmd->rq, BLK_STS_AGAIN); 444 | } 445 | } 446 | mutex_unlock(&wq->lock); 447 | 448 | kfree(data); 449 | do_exit(0); 450 | } 451 | 452 | /* 453 | * Flush operation. It broadcasts flush to all queues. 454 | */ 455 | blk_status_t buse_flush(struct buse_cmd *cmd) 456 | { 457 | int i; 458 | struct cmd_q_args *args; 459 | struct buse_queue *q = cmd->queue; 460 | struct buse *buse = q->w.buse; 461 | size_t num_queues = buse->num_queues; 462 | 463 | atomic_set(&cmd->flush.queues_pending, num_queues); 464 | 465 | for (i = 0; i < num_queues; i++) { 466 | args = kzalloc(sizeof(*args), GFP_KERNEL); 467 | if (!args) { 468 | pr_debug("Cannot allocate!\n"); 469 | goto err; 470 | } 471 | 472 | args->cmd = cmd; 473 | args->q = &buse->queues[i]; 474 | 475 | if (kthread_run(wqueue_flush, args, "buse-flush%d", i) < 0) { 476 | pr_alert("Cannot spawn wqueue_flush thread!\n"); 477 | goto err_args; 478 | } 479 | } 480 | 481 | return BLK_STS_OK; 482 | 483 | err_args: 484 | kfree(args); 485 | err: 486 | atomic_sub(num_queues - i, &cmd->flush.queues_pending); 487 | cmd->canceled = true; 488 | 489 | if (!i) 490 | return BLK_STS_AGAIN; 491 | 492 | return BLK_STS_OK; 493 | } 494 | 495 | /* 496 | * Another implementation of the flush logic. This one does flush broadcasting sequentially without 497 | * spawning additional threads. Kept here for potentional architecture change in the future. 498 | */ 499 | /* 500 | * blk_status_t buse_flush(struct buse_cmd *cmd) 501 | * { 502 | * int i; 503 | * struct buse_queue *q = cmd->queue; 504 | * struct buse *buse = q->w.buse; 505 | * size_t num_queues = buse->num_queues; 506 | * struct buse_wqueue *wq; 507 | * size_t collision_areas = buse->size / buse->collision_area_size; 508 | * 509 | * atomic_set(&cmd->flush.queues_pending, num_queues); 510 | * 511 | * for (i = 0; i < num_queues; i++) { 512 | * wq = &buse->queues[i].w; 513 | * mutex_lock(&wq->lock); 514 | * } 515 | * 516 | * for (i = 0; i < num_queues; i++) { 517 | * wq = &buse->queues[i].w; 518 | * close_chunk(wq); 519 | * if (send_flush(wq, cmd) == -1) { 520 | * pr_debug("Cannot send flush packet from flusher!\n"); 521 | * cmd->canceled = true; 522 | * if (atomic_dec_and_test(&cmd->flush.queues_pending)) { 523 | * blk_mq_start_request(cmd->rq); 524 | * blk_mq_end_request(cmd->rq, BLK_STS_AGAIN); 525 | * } 526 | * break; 527 | * } 528 | * } 529 | * 530 | * memset(wq->buse->collision_counters, 0, collision_areas); 531 | * 532 | * for (i = 0; i < num_queues; i++) { 533 | * wq = &buse->queues[i].w; 534 | * mutex_unlock(&wq->lock); 535 | * } 536 | * 537 | * return BLK_STS_OK; 538 | * } 539 | */ 540 | 541 | /* 542 | * Drains all the queues because the is shutting down non-gracefully and we don't want memory leaks. 
543 | */ 544 | static void wqueue_drain(struct buse_wqueue *wq) 545 | { 546 | struct write_chunk *chunk; 547 | 548 | mutex_lock(&wq->lock); 549 | close_chunk(wq); 550 | while (!list_empty(&wq->busy_chunks)) { 551 | chunk = list_first_entry(&wq->busy_chunks, struct write_chunk, list); 552 | mutex_unlock(&wq->lock); 553 | if (is_wqueue_term(chunk)) { 554 | mutex_lock(&wq->lock); 555 | list_del_init(&chunk->list); 556 | mutex_unlock(&wq->lock); 557 | kfree(chunk); 558 | } else 559 | ack_write_request(wq, chunk->shmem_offset, true); 560 | mutex_lock(&wq->lock); 561 | } 562 | 563 | while (!list_empty(&wq->fetched_chunks)) { 564 | chunk = list_first_entry(&wq->fetched_chunks, struct write_chunk, list); 565 | mutex_unlock(&wq->lock); 566 | ack_write_request(wq, chunk->shmem_offset, true); 567 | mutex_lock(&wq->lock); 568 | } 569 | mutex_unlock(&wq->lock); 570 | } 571 | 572 | /* 573 | * Deallocates the write queue. 574 | */ 575 | static void wqueue_exit(struct buse_wqueue *wq) 576 | { 577 | wqueue_drain(wq); 578 | kfree(wq->chunks); 579 | vfree(wq->shmem); 580 | } 581 | 582 | /* 583 | * Allocates the write queue. 584 | */ 585 | static int wqueue_init(struct buse_wqueue *wq) 586 | { 587 | int ret, i; 588 | struct buse *buse = wq->buse; 589 | uint w_chunks = buse->write_shm_size / buse->write_chunk_size; 590 | int numa_node = buse_get_numa_node_for_queue_id(wq->buse, wq->q->id); 591 | 592 | init_waitqueue_head(&wq->busy_chunks_avail); 593 | init_waitqueue_head(&wq->free_chunks_avail); 594 | INIT_LIST_HEAD(&wq->free_chunks); 595 | INIT_LIST_HEAD(&wq->busy_chunks); 596 | INIT_LIST_HEAD(&wq->fetched_chunks); 597 | 598 | mutex_init(&wq->lock); 599 | 600 | wq->size = buse->write_shm_size; 601 | 602 | wq->shmem = vmalloc_node(wq->size, numa_node); 603 | if (wq->shmem == NULL) { 604 | ret = -ENOMEM; 605 | goto err; 606 | } 607 | 608 | wq->chunks = kcalloc_node(w_chunks, sizeof(*wq->chunks), GFP_KERNEL, numa_node); 609 | if (!wq->chunks) { 610 | ret = -ENOMEM; 611 | goto err_shmem; 612 | } 613 | 614 | for (i = 0; i < w_chunks; i++) 615 | init_write_chunk(wq, &wq->chunks[i]); 616 | 617 | open_chunk(wq); 618 | 619 | return 0; 620 | 621 | err_shmem: 622 | vfree(wq->shmem); 623 | err: 624 | return ret; 625 | } 626 | 627 | /* 628 | * Init all write queues. 629 | */ 630 | int buse_wqueues_init(struct buse *buse) 631 | { 632 | int ret, i; 633 | struct buse_queue *q; 634 | size_t collisions_areas = buse->size / buse->collision_area_size; 635 | 636 | for (i = 0, q = buse->queues; i < buse->num_queues; i++, q++) { 637 | q->w.buse = buse; 638 | q->w.q = q; 639 | ret = wqueue_init(&q->w); 640 | if (ret) { 641 | i++; 642 | q++; 643 | goto err; 644 | } 645 | } 646 | 647 | buse->collision_counters = kcalloc(collisions_areas, sizeof(*buse->collision_counters), GFP_KERNEL); 648 | if (!buse->collision_counters) { 649 | ret = -ENOMEM; 650 | goto err; 651 | } 652 | 653 | return 0; 654 | 655 | err: 656 | for (i--, q--; i > 0; i--, q--) 657 | wqueue_exit(&q->w); 658 | 659 | return ret; 660 | } 661 | 662 | /* 663 | * Deinit all write queues. 664 | */ 665 | int buse_wqueues_exit(struct buse *buse) 666 | { 667 | int i; 668 | struct buse_queue *q; 669 | 670 | for (i = 0, q = buse->queues; i < buse->num_queues; i++, q++) 671 | wqueue_exit(&q->w); 672 | 673 | kfree(buse->collision_counters); 674 | 675 | return 0; 676 | } 677 | 678 | /* 679 | * Rerun all fetched chunks by the user space again. This is called when user space failes without 680 | * acknowledging write chunks and reconnects again. 
681 | */ 682 | static void rerun_write_chunks(struct buse_wqueue *wq) 683 | { 684 | struct write_chunk *ch; 685 | 686 | mutex_lock(&wq->lock); 687 | while (!list_empty(&wq->fetched_chunks)) { 688 | ch = list_last_entry(&wq->fetched_chunks, struct write_chunk, list); 689 | list_del_init(&ch->list); 690 | list_add(&ch->list, &wq->busy_chunks); 691 | } 692 | wake_up(&wq->busy_chunks_avail); 693 | mutex_unlock(&wq->lock); 694 | } 695 | 696 | /* 697 | * Set the queue to be bound. 698 | */ 699 | void buse_wqueue_bind(struct buse_wqueue *wq) 700 | { 701 | atomic_set(&wq->bound, 1); 702 | buse_blkdev_init_cond(wq->buse); 703 | rerun_write_chunks(wq); 704 | } 705 | 706 | /* 707 | * Returns true if all write queues are bound. I.e. have connected the userspace counterpart. 708 | */ 709 | bool buse_wqueues_bound(struct buse *buse) 710 | { 711 | int i; 712 | struct buse_wqueue *wq; 713 | 714 | for (i = 0; i < buse->num_queues; i++) { 715 | wq = &buse->queues[i].w; 716 | if (atomic_read(&wq->bound) == 0) 717 | return false; 718 | } 719 | 720 | return true; 721 | } 722 | -------------------------------------------------------------------------------- /kernel/buse-wqueue.h: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2021 Vojtech Aschenbrenner */ 2 | 3 | #ifndef BUSE_WQUEUE_H 4 | #define BUSE_WQUEUE_H 5 | 6 | #include 7 | #include "main.h" 8 | 9 | /* 10 | * When userspace acknowledge the write chunk we perform appropriate actions based on the write 11 | * chunk type. 12 | */ 13 | void ack_write_request(struct buse_wqueue *wq, u64 chunk_offset, bool draining); 14 | 15 | /* 16 | * Copies data to the active chunk and immediately acknowledge the write request. 17 | */ 18 | blk_status_t buse_write(struct buse_cmd *cmd); 19 | 20 | /* 21 | * Init all write queues. 22 | */ 23 | int buse_wqueues_init(struct buse *buse); 24 | 25 | /* 26 | * Deinit all write queues. 27 | */ 28 | int buse_wqueues_exit(struct buse *buse); 29 | 30 | /* 31 | * Flush operation. It broadcasts flush to all queues. 32 | */ 33 | blk_status_t buse_flush(struct buse_cmd *cmd); 34 | 35 | /* 36 | * Closes active chunk of the queue, i.e. no more writes can be written to the chunk and a new 37 | * chunks has to be opened. This usually means that flush happened or the chunk is full. 38 | */ 39 | int close_chunk(struct buse_wqueue *wq); 40 | 41 | bool is_flush_packet(struct write_chunk *wc); 42 | 43 | /* 44 | * Pulls write chunk from the busy queue and returns it. If there is no write chunk in the busy queue, 45 | * we sleep. If the chunks is not a termination chunk, we add to the fetched list meaning that the 46 | * chunk is in userspace but not yet acknowledged. It is for the case of userspace failure and 47 | * potential rerun fetched but not yet acknowledged chunks. 48 | */ 49 | struct write_chunk *pop_write_request_wait(struct buse_wqueue *wq); 50 | 51 | /* 52 | * Returns true if all write queues are bound. I.e. have connected the userspace counterpart. 53 | */ 54 | bool buse_wqueues_bound(struct buse *buse); 55 | 56 | /* 57 | * Set the queue to be bound. 58 | */ 59 | void buse_wqueue_bind(struct buse_wqueue *wq); 60 | 61 | /* 62 | * Sends termination chunk to the write queue. 63 | */ 64 | void wqueue_send_term(struct buse_wqueue *wq); 65 | 66 | /* 67 | * True if the chunk is termination chunk. 
68 | */ 69 | bool is_wqueue_term(struct write_chunk *ch); 70 | 71 | #endif 72 | -------------------------------------------------------------------------------- /kernel/main.c: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2021-2022 Vojtech Aschenbrenner */ 2 | 3 | #include 4 | #include 5 | 6 | #include "buse-blkdev.h" 7 | #include "buse-chrdev.h" 8 | #include "buse-configfs.h" 9 | #include "buse-rqueue.h" 10 | #include "buse-wqueue.h" 11 | #include "main.h" 12 | 13 | const char *buse_blkdev_name = "buse"; 14 | const int buse_blkdev_max_minors = 16; 15 | int buse_blkdev_major; 16 | struct class *buse_chrdev_class; 17 | 18 | /* 19 | * Add new buse device with index and sets default parameters. All parameters can be changed via configfs. 20 | */ 21 | struct buse *buse_add(uint index) 22 | { 23 | int ret; 24 | 25 | struct buse *buse = kzalloc(sizeof(*buse), GFP_KERNEL); 26 | if (!buse) { 27 | ret = -ENOMEM; 28 | goto err; 29 | } 30 | 31 | atomic_set(&buse->stopped, 1); 32 | 33 | mutex_init(&buse->configfs_mutex); 34 | buse->index = index; 35 | buse->size = SZ_1G; 36 | buse->block_size = 512; 37 | buse->io_min = buse->block_size; 38 | buse->io_opt = buse->block_size; 39 | buse->write_chunk_size = 2 * SZ_1M; 40 | buse->write_shm_size = 32 * SZ_1M; 41 | buse->read_shm_size = buse->write_shm_size; 42 | buse->queue_depth = 64; 43 | buse->no_scheduler = true; 44 | buse->can_secure_erase = false; 45 | buse->can_discard = false; 46 | buse->can_write_same = false; 47 | buse->can_write_zeroes = false; 48 | buse->hw_queues = 1; 49 | buse->collision_area_size = 4096; 50 | 51 | return buse; 52 | 53 | err: 54 | return ERR_PTR(ret); 55 | } 56 | 57 | /* 58 | * Checks whether all queues are connected and creates the block device eventually. 59 | */ 60 | void buse_blkdev_init_cond(struct buse *buse) 61 | { 62 | int ret; 63 | 64 | if (!buse_wqueues_bound(buse) || 65 | !buse_rqueues_bound(buse) || 66 | buse->blkdev.created) 67 | return; 68 | 69 | buse->blkdev.created = true; 70 | buse_gendisk_register(buse); 71 | return; 72 | 73 | ret = buse_blkdev_init(buse); 74 | if (ret) 75 | goto err; 76 | 77 | return; 78 | 79 | err: 80 | return; 81 | } 82 | 83 | /* 84 | * Initialize all structures for created device. 85 | */ 86 | int buse_on(struct buse *buse) 87 | { 88 | int ret; 89 | 90 | buse->queues = kcalloc(buse->hw_queues, sizeof(*buse->queues), GFP_KERNEL); 91 | if (!buse->queues) { 92 | ret = -ENOMEM; 93 | goto err; 94 | } 95 | 96 | ret = buse_blkdev_init(buse); 97 | if (ret) 98 | goto err_queues; 99 | 100 | ret = buse_chrdev_init(buse); 101 | if (ret) 102 | goto err_blk; 103 | 104 | ret = buse_rqueues_init(buse); 105 | if (ret) 106 | goto err_chr; 107 | 108 | ret = buse_wqueues_init(buse); 109 | if (ret) 110 | goto err_r_init; 111 | 112 | return 0; 113 | 114 | err_r_init: 115 | buse_rqueues_exit(buse); 116 | err_chr: 117 | buse_chrdev_exit(buse); 118 | err_blk: 119 | buse_blkdev_exit(buse); 120 | err_queues: 121 | kfree(buse->queues); 122 | buse->queues = NULL; 123 | err: 124 | return ret; 125 | } 126 | 127 | /* 128 | * Deletes all the structures needed by the device. 
129 | */ 130 | int buse_off(struct buse *buse) 131 | { 132 | if (!buse->queues) 133 | return -EINVAL; 134 | 135 | if (buse_wqueues_bound(buse) || 136 | buse_rqueues_bound(buse)) 137 | return -EBUSY; 138 | 139 | buse_wqueues_exit(buse); 140 | buse_rqueues_exit(buse); 141 | buse_chrdev_exit(buse); 142 | buse_blkdev_exit(buse); 143 | kfree(buse->queues); 144 | buse->queues = NULL; 145 | 146 | return 0; 147 | } 148 | 149 | /* 150 | * Frees the buse structure. 151 | */ 152 | void buse_del(struct buse *buse) 153 | { 154 | kfree(buse); 155 | } 156 | 157 | /* 158 | * Sends the termination chunks to all queues signaling that the device is stopping. This is 159 | * reaction to writing 0 to the power configfs attribute. When the userspace disconnect all the 160 | * queues, it can call buse_off(). 161 | */ 162 | void buse_stop(struct buse *buse) 163 | { 164 | int i; 165 | struct buse_wqueue *wq; 166 | struct buse_rqueue *rq; 167 | 168 | if (!buse->queues) 169 | return; 170 | 171 | for (i = 0; i < buse->num_queues; i++) { 172 | wq = &buse->queues[i].w; 173 | wqueue_send_term(wq); 174 | } 175 | 176 | for (i = 0; i < buse->num_queues; i++) { 177 | rq = &buse->queues[i].r; 178 | rqueue_send_term(rq); 179 | } 180 | } 181 | 182 | /* 183 | * Kernel module init function which is ran when the module is loaded. It just registers majors for 184 | * block device and character devices and initialize configfs subsystem. All further operations are 185 | * triggered from configfs. 186 | */ 187 | static int __init buse_init(void) 188 | { 189 | int ret; 190 | 191 | buse_blkdev_major = register_blkdev(0, buse_blkdev_name); 192 | if (buse_blkdev_major < 0) { 193 | ret = buse_blkdev_major; 194 | goto err; 195 | } 196 | 197 | buse_chrdev_class = class_create(THIS_MODULE, buse_blkdev_name); 198 | if (IS_ERR(buse_chrdev_class)) { 199 | ret = PTR_ERR(buse_chrdev_class); 200 | goto err_blk; 201 | } 202 | 203 | ret = buse_configfs_init(); 204 | if (ret) 205 | goto err_class; 206 | 207 | return 0; 208 | 209 | err_class: 210 | class_destroy(buse_chrdev_class); 211 | err_blk: 212 | unregister_blkdev(buse_blkdev_major, buse_blkdev_name); 213 | err: 214 | return ret; 215 | } 216 | 217 | /* 218 | * Kernel module exit function. Cleanup all module related structures. Module can be unloaded only 219 | * if all devices are destroyed. 220 | */ 221 | static void __exit buse_exit(void) 222 | { 223 | class_destroy(buse_chrdev_class); 224 | unregister_blkdev(buse_blkdev_major, buse_blkdev_name); 225 | 226 | buse_configfs_exit(); 227 | } 228 | 229 | module_init(buse_init); 230 | module_exit(buse_exit); 231 | 232 | MODULE_LICENSE("GPL"); 233 | MODULE_AUTHOR("Vojtech Aschenbrenner "); 234 | MODULE_DESCRIPTION("BUSE"); 235 | MODULE_VERSION("0.0.1"); 236 | -------------------------------------------------------------------------------- /kernel/main.h: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2021-2022 Vojtech Aschenbrenner */ 2 | 3 | #ifndef BUSE_MAIN_H 4 | #define BUSE_MAIN_H 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | #define BUSE_MAGIC 0xB3 11 | 12 | extern const char *buse_blkdev_name; 13 | extern const int buse_blkdev_max_minors; 14 | extern int buse_blkdev_major; 15 | extern struct class *buse_chrdev_class; 16 | 17 | /* 18 | * Per block device structure containing all necessary fields for creating mq block device. 
19 | */ 20 | struct buse_blkdev 21 | { 22 | struct blk_mq_tag_set tag_set; 23 | struct gendisk *disk; 24 | struct request_queue *request_queue; 25 | 26 | /* Flag which is set once the device is created. This is important 27 | * because we don't create device immediately but wait until all 28 | * control queues are connected. Hence it is important to keep track of 29 | * it to know whether to destroy the block device during shut down. 30 | */ 31 | bool created; 32 | }; 33 | 34 | /* 35 | * Global module structure. 36 | */ 37 | struct buse 38 | { 39 | /* Configfs related fields. */ 40 | struct config_item item; 41 | struct mutex configfs_mutex; 42 | 43 | /* Indicator that device was stopped. All further io requests are refused. */ 44 | atomic_t stopped; 45 | 46 | /* Block device related structure. */ 47 | struct buse_blkdev blkdev; 48 | 49 | /* Sequential numbers for writes. We define one counter per collision 50 | * domain to avoid excessive cache coherency protocol traffic. This 51 | * creates ordering on all writes inside the collision domain which is 52 | * enough. A counter per sector would be optimal, but memory 53 | * inefficient. One counter per whole address space would be to 54 | * contended. Collision domains are a good compromise. 55 | */ 56 | atomic_t *collision_counters; 57 | 58 | /* Individual queues structure related to the created character 59 | * devices. */ 60 | struct buse_queue *queues; 61 | int num_queues; 62 | 63 | /* Attributes set by configfs operations. */ 64 | 65 | /* Setting to 1 powers on the device and queues can be bound. */ 66 | bool power; 67 | 68 | /* Index of the created block device corresponding to the created 69 | * configfs node with mkdir. */ 70 | u64 index; 71 | 72 | /* Size of the device in bytes. */ 73 | u64 size; 74 | 75 | /* Block size. This should be 512 or 4096. */ 76 | u64 block_size; 77 | 78 | /* Minimal IO size. Has to be >= block_size and a power of 2. */ 79 | u64 io_min; 80 | 81 | /* Optimal IO size. Has to be >= block_size and a power of 2. */ 82 | u64 io_opt; 83 | 84 | /* Max size of one write chunk which is passed to the userspace. */ 85 | u64 write_chunk_size; 86 | 87 | /* Size of the shared memory between kernel and userspace which is used 88 | * for sending write chunks to the userspace. This is per one write 89 | * queue. 90 | */ 91 | u64 write_shm_size; 92 | 93 | /* Size of the shared memory between kernel and userspace which is used 94 | * for sending individual reads to the userspace. This is per one write 95 | * queue. Compared to writes reads are not batched into chunks. Each 96 | * individual read is sent to userspace. 97 | */ 98 | u64 read_shm_size; 99 | 100 | /* Queue depth of the created block device. */ 101 | u64 queue_depth; 102 | 103 | /* Number of hw queues block device provides. Usually number of CPUs is 104 | * the right value. */ 105 | u64 hw_queues; 106 | 107 | /* Size of the area sharing the space of write sequential numbers. */ 108 | u64 collision_area_size; 109 | 110 | /* Instructs blk-mq no to use scheduler on the queues. */ 111 | bool no_scheduler; 112 | 113 | /* For future usage. */ 114 | bool can_secure_erase; 115 | bool can_write_same; 116 | bool can_write_zeroes; 117 | bool can_discard; 118 | }; 119 | 120 | /* 121 | * Per character device structure. Character device represents a queue in our model. 122 | */ 123 | struct buse_chrdev 124 | { 125 | struct cdev cdev; 126 | struct device *dev; 127 | dev_t region; 128 | }; 129 | 130 | /* 131 | * Read queue structure. 
132 | */ 133 | struct buse_rqueue 134 | { 135 | /* Pointer to the main buse structure. */ 136 | struct buse *buse; 137 | 138 | /* Pointer to the corresponding struct queue */ 139 | struct buse_queue *q; 140 | 141 | /* Character device corresponding to the read queue. */ 142 | struct buse_chrdev chrdev; 143 | 144 | /* Shared memory area between kernel and user space. */ 145 | void *shmem; 146 | size_t size; 147 | 148 | /* Flag whether individual queue is bound, i.e. the character device is 149 | * opened and mmaped. */ 150 | atomic_t bound; 151 | 152 | /* Mapping from the bitmap index to the read chunk. Used when bitmap 153 | * index is acknowledged to know what read to acknowledge. 154 | */ 155 | struct read_chunk **chunk_from_bitmap; 156 | 157 | /* Waitqueue on the event when no busy chunk is available, i.e. there 158 | * is nothing to send to the userspace. 159 | */ 160 | wait_queue_head_t busy_chunks_avail; 161 | 162 | /* Waitqueue on the event when no free chunk is available, i.e. there 163 | * is no space to process additional reads. 164 | */ 165 | wait_queue_head_t free_chunks_avail; 166 | 167 | /* Lock per the whole read queue. */ 168 | struct mutex lock; 169 | 170 | /* Bitmap for keeping track of free space in shared memory. */ 171 | unsigned long *free_chunks_bitmap; 172 | 173 | /* Queue with chunks ready to be sent to user space. */ 174 | struct list_head busy_chunks; 175 | 176 | /* Queue with chunks already sent to user space. Important when user 177 | * space side crashes to rerun not acknowledged but fetched reads 178 | * again. 179 | */ 180 | struct list_head fetched_chunks; 181 | 182 | /* If true the termination chunk was already sent to user space and no 183 | * other chunk can be processed by the other end of the queue. 184 | */ 185 | bool terminated; 186 | }; 187 | 188 | /* 189 | * Description of individual write in the metadata part of the chunk. 190 | */ 191 | struct writelist_item 192 | { 193 | /* First written sector. */ 194 | size_t sector; 195 | 196 | /* Length of the write in sectors. */ 197 | size_t len; 198 | 199 | /* Sequential number of write. */ 200 | size_t id; 201 | 202 | /* Reserved for future usage. */ 203 | size_t flag; 204 | }; 205 | 206 | /* 207 | * Write chunk is the unit sent to the user space. It containes batched writes and is split into two 208 | * parts. Metadata part containes information about the writes and data part contains their data. 209 | */ 210 | struct write_chunk 211 | { 212 | /* Chunk can be part of list. */ 213 | struct list_head list; 214 | 215 | /* Offset to the shared memory where the chunk starts. */ 216 | u64 shmem_offset; 217 | 218 | /* Number of writes batched in the chunk. */ 219 | u64 num_writes; 220 | 221 | /* Helper pointer to keep track where next write of data should go. */ 222 | void *data_frontier; 223 | 224 | /* Helper pointer to keep track where next write of metadata should go. */ 225 | struct writelist_item *writelist_frontier; 226 | 227 | /* List of all reads waiting for any write in the chunk. These reads 228 | * are postponed and woken up when the write is acknowledged. Solution 229 | * of the read after write hazard. 230 | */ 231 | struct list_head dependent_reads; 232 | 233 | /* If the chunks is flush chunk, i.e. just performing the flush 234 | * operation, we store the cmd pointer here to be able to acknowledge 235 | * it easily. 236 | */ 237 | struct buse_cmd *cmd; 238 | }; 239 | 240 | /* 241 | * Read chunk is the unit sent to the user space. 
In contrast to a write chunk, it has a variable length
242 | * and corresponds to exactly one read request.
243 | */
244 | struct read_chunk
245 | {
246 | 	/* Part of the list. */
247 | 	struct list_head list;
248 | 
249 | 	/* First sector of the read. */
250 | 	size_t sector;
251 | 
252 | 	/* Length of the read in sectors. */
253 | 	size_t len;
254 | 
255 | 	/* Offset in the shared memory where the chunk starts. */
256 | 	size_t shmem_offset;
257 | 
258 | 	/* Pointer to the cmd which has to be acknowledged when this chunk is acknowledged. */
259 | 	struct buse_cmd *cmd;
260 | };
261 | 
262 | /*
263 | * Write queue structure.
264 | */
265 | struct buse_wqueue
266 | {
267 | 	/* Pointer to the main buse structure. */
268 | 	struct buse *buse;
269 | 
270 | 	/* Pointer to the corresponding struct queue */
271 | 	struct buse_queue *q;
272 | 
273 | 	/* Character device corresponding to the write queue. */
274 | 	struct buse_chrdev chrdev;
275 | 
276 | 	/* Shared memory area between kernel and user space. */
277 | 	void *shmem;
278 | 	size_t size;
279 | 
280 | 	/* Flag whether individual queue is bound, i.e. the character device is
281 | 	 * opened and mmaped.
282 | 	 */
283 | 	atomic_t bound;
284 | 
285 | 	/* Waitqueue on the event when no busy chunk is available, i.e. there
286 | 	 * is nothing to send to the userspace.
287 | 	 */
288 | 	wait_queue_head_t busy_chunks_avail;
289 | 
290 | 	/* Waitqueue on the event when no free chunk is available, i.e. there
291 | 	 * is no space to process additional writes.
292 | 	 */
293 | 	wait_queue_head_t free_chunks_avail;
294 | 
295 | 	/* Array of all write chunks. */
296 | 	struct write_chunk *chunks;
297 | 
298 | 	/* Lock per the whole write queue. */
299 | 	struct mutex lock;
300 | 
301 | 
302 | 	/* Queue keeping track of free write chunks. */
303 | 	struct list_head free_chunks;
304 | 
305 | 	/* Queue with chunks ready to be sent to user space. */
306 | 	struct list_head busy_chunks;
307 | 
308 | 	/* Queue with chunks already sent to user space. Important when the user
309 | 	 * space side crashes to rerun not acknowledged but fetched writes
310 | 	 * again.
311 | 	 */
312 | 	struct list_head fetched_chunks;
313 | 
314 | 	/* Currently active chunk in the individual queue. All writes are going to this chunk. */
315 | 	struct write_chunk *active_chunk;
316 | 
317 | 	/* If true the termination chunk was already sent to user space and no
318 | 	 * other chunk can be processed by the other end of the queue.
319 | 	 */
320 | 	bool terminated;
321 | };
322 | 
323 | /*
324 | * Puts the read and write queues together and assigns them an id. Just for convenience and easier
325 | * debugging.
326 | */
327 | struct buse_queue
328 | {
329 | 	struct buse_rqueue r;
330 | 	struct buse_wqueue w;
331 | 	size_t id;
332 | };
333 | 
334 | /*
335 | * Request extension to be insertable into a list.
336 | */
337 | struct rq_node
338 | {
339 | 	struct list_head list;
340 | 	struct request *rq;
341 | };
342 | 
343 | /*
344 | * Custom cmd which is allocated for each cmd coming from the blk-mq queue. It contains the per-command state described below.
345 | */
346 | struct buse_cmd
347 | {
348 | 	/* Magic number as a sanity check that we are reading the right memory. */
349 | 	u8 magic;
350 | 
351 | 	/* Request corresponding to the cmd. */
352 | 	struct request *rq;
353 | 
354 | 	/* Queue where the request arrived. */
355 | 	struct buse_queue *queue;
356 | 
357 | 	/* True if some operation failed and at the end the cmd should be
358 | 	 * canceled and reported to the blk-mq.
359 | 	 */
360 | 	bool canceled;
361 | 
362 | 	/* Helper fields for different types of commands.
*/ 363 | union { 364 | struct { 365 | /* How many more queues need to do their check for read 366 | * after write hazard. 367 | */ 368 | atomic_t queues_pending; 369 | 370 | /* How many writes need to be acknowledged until the 371 | * read can be send to user space. 372 | */ 373 | atomic_t write_deps; 374 | } read; 375 | 376 | struct { 377 | /* How many more queues need to send the flush chunk. 378 | * This is used when broadcasting flush command. 379 | */ 380 | atomic_t queues_pending; 381 | } flush; 382 | }; 383 | }; 384 | 385 | /* 386 | * Helper for passing arguments when creating new thread. 387 | */ 388 | struct cmd_q_args 389 | { 390 | struct buse_cmd *cmd; 391 | struct buse_queue *q; 392 | }; 393 | 394 | /* Adds new buse device with given index. */ 395 | struct buse *buse_add(uint index); 396 | 397 | /* Delete buse device. */ 398 | void buse_del(struct buse *buse); 399 | 400 | /* Turns on buse. */ 401 | int buse_on(struct buse *buse); 402 | 403 | /* Turns off buse. Cannot be started again. */ 404 | int buse_off(struct buse *buse); 405 | 406 | /* Stops buse. No io requests are accepted but can be started again. */ 407 | void buse_stop(struct buse *buse); 408 | 409 | /* Checks if all queues are connected and if they are it creates the block 410 | * device and is ready to serve io commands. 411 | */ 412 | void buse_blkdev_init_cond(struct buse *buse); 413 | 414 | #endif 415 | -------------------------------------------------------------------------------- /lib/go/buse/buse.go: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2021-2022 Vojtech Aschenbrenner 2 | 3 | package buse 4 | 5 | import ( 6 | "encoding/binary" 7 | "errors" 8 | "fmt" 9 | "io/ioutil" 10 | "os" 11 | "runtime" 12 | "sync" 13 | "syscall" 14 | 15 | "golang.org/x/sys/unix" 16 | ) 17 | 18 | const ( 19 | // Character device for buse device %d and read queue %d. 20 | buseReadPathFmt = "/dev/buse%d-r%d" 21 | 22 | // Character device for buse device %d and write queue %d. 23 | buseWritePathFmt = "/dev/buse%d-w%d" 24 | 25 | // Path to the configfs directory. 26 | configFsPath = "/sys/kernel/config/buse" 27 | 28 | // Size of write request in write queue. 29 | writeRequestSize = 16 30 | 31 | // Size of read request in read queue. 32 | readRequestSize = 24 33 | ) 34 | 35 | // Provides functions which are called by buse as a reaction to the received 36 | // command. 37 | type BuseReadWriter interface { 38 | // BuseRead should read the extent starting at the given sector with 39 | // the given length. The read data should be written to the provided 40 | // sice. The chunk is guaranteed to have sufficient capacity to hold 41 | // the data. 42 | // 43 | // This method is called by the BUSE library in response to a read 44 | // request received from the kernel driver. 45 | BuseRead(sector, length int64, chunk []byte) error 46 | 47 | // BuseWrite should handle all writes stored in the given chunk. The 48 | // first argument holds the number of writes in the chunk. 49 | // 50 | // This method is called by the BUSE library in response to a write 51 | // or flush request received from the kernel driver. 52 | BuseWrite(writes int64, chunk []byte) error 53 | 54 | // BusePreRun is called immediately before the device is started. 55 | BusePreRun() 56 | 57 | // BusePostRemove is called after the device is removed. 58 | BusePostRemove() 59 | } 60 | 61 | // Options for created buse device. 
62 | type Options struct { 63 | Durable bool 64 | WriteChunkSize int64 65 | BlockSize int64 66 | IOMin int64 67 | IOOpt int64 68 | Threads int 69 | Major int64 70 | WriteShmSize int64 71 | ReadShmSize int64 72 | Size int64 73 | CollisionArea int64 74 | QueueDepth int64 75 | Scheduler bool 76 | CPUsPerNode int 77 | } 78 | 79 | // Buse is a library wrapping the low level interaction with buse kernel module 80 | // and provides simple API to for creating a block device in user space. 81 | type Buse struct { 82 | ReadWriter BuseReadWriter 83 | Options Options 84 | } 85 | 86 | // Returns new instance of Buse configured with options o. 87 | func New(rw BuseReadWriter, o Options) (Buse, error) { 88 | buse := Buse{ 89 | ReadWriter: rw, 90 | Options: o, 91 | } 92 | 93 | err := buse.checkOptions() 94 | if err != nil { 95 | return Buse{}, err 96 | } 97 | 98 | err = buse.configure() 99 | if err != nil { 100 | return Buse{}, err 101 | } 102 | 103 | return buse, nil 104 | } 105 | 106 | // Returns total memory presented to the system. 107 | func totalMemory() (uint64, error) { 108 | sysInfo := &syscall.Sysinfo_t{} 109 | 110 | if err := syscall.Sysinfo(sysInfo); err != nil { 111 | return 0, err 112 | } 113 | 114 | // On 32-bit architectures the result is uint, hence we need to type it 115 | // to uint64 to conform with function signature. 116 | totalMemory := uint64(sysInfo.Totalram) * uint64(sysInfo.Unit) 117 | 118 | return totalMemory, nil 119 | } 120 | 121 | // Validates passed options. 122 | func (b *Buse) checkOptions() error { 123 | o := &b.Options 124 | 125 | if o.Threads == 0 || o.Threads > runtime.NumCPU() { 126 | o.Threads = runtime.NumCPU() 127 | } 128 | 129 | if o.CPUsPerNode == 0 || o.CPUsPerNode > runtime.NumCPU() { 130 | o.CPUsPerNode = runtime.NumCPU() 131 | } 132 | 133 | if o.IOMin == 0 { 134 | o.IOMin = o.BlockSize 135 | } 136 | 137 | if o.IOOpt == 0 { 138 | o.IOOpt = o.BlockSize 139 | } 140 | 141 | totalMem, err := totalMemory() 142 | if err != nil { 143 | return errors.New("Cannot read total amount of ram!") 144 | } 145 | 146 | neededMemory := uint64(o.Threads) * uint64(o.WriteShmSize+o.ReadShmSize) 147 | if neededMemory > totalMem { 148 | return errors.New("Not enough memory!") 149 | } 150 | 151 | if o.WriteShmSize%o.WriteChunkSize != 0 { 152 | return errors.New("Write buffer size has to be a multiple of chunk size!") 153 | } 154 | 155 | if o.BlockSize != 512 && o.BlockSize != 4096 { 156 | return errors.New("Block size has to 512 or 4096!") 157 | } 158 | 159 | if o.IOMin < o.BlockSize || o.IOMin%2 != 0 { 160 | return errors.New("Minimal IO has to be at least a block size and a power of 2!") 161 | } 162 | 163 | if o.IOOpt < o.BlockSize || o.IOOpt%2 != 0 { 164 | return errors.New("Optimal IO has to be at least a block size and a power of 2!") 165 | } 166 | 167 | return nil 168 | } 169 | 170 | // Performs configuration of the block device which is just being created. It 171 | // configures buse device via configs according to the options passed to the 172 | // New() function. When configuration succeed the device is power on. 
173 | func (b *Buse) configure() error { 174 | var noScheduler int64 175 | if !b.Options.Scheduler { 176 | noScheduler = 1 177 | } 178 | 179 | configFsPath := fmt.Sprint(configFsPath, "/", b.Options.Major) 180 | if _, err := os.Stat(configFsPath); !os.IsNotExist(err) { 181 | return errors.New(fmt.Sprintf("Device buse%d already exists!", b.Options.Major)) 182 | } 183 | 184 | if err := os.Mkdir(configFsPath, 0755); err != nil { 185 | return err 186 | } 187 | 188 | kernelParams := map[string]int64{ 189 | "size": b.Options.Size, 190 | "collision_area_size": int64(b.Options.CollisionArea), 191 | "read_shm_size": int64(b.Options.ReadShmSize), 192 | "write_shm_size": int64(b.Options.WriteShmSize), 193 | "write_chunk_size": int64(b.Options.WriteChunkSize), 194 | "hw_queues": int64(b.Options.Threads), 195 | "blocksize": int64(b.Options.BlockSize), 196 | "io_min": int64(b.Options.IOMin), 197 | "io_opt": int64(b.Options.IOOpt), 198 | "queue_depth": int64(b.Options.QueueDepth), 199 | "no_scheduler": noScheduler, 200 | } 201 | 202 | for variable, value := range kernelParams { 203 | if err := b.setConfig(variable, value); err != nil { 204 | return err 205 | } 206 | } 207 | 208 | if err := b.setConfig("power", 1); err != nil { 209 | return err 210 | } 211 | 212 | return nil 213 | } 214 | 215 | // Opens control file and mmap it. Returns file and mmapped memory. 216 | func openAndMmapControlFile(chardev string, shm_size int) (*os.File, []byte, error) { 217 | f, err := os.OpenFile(chardev, os.O_RDWR, 0644) 218 | if err != nil { 219 | return nil, nil, err 220 | } 221 | 222 | shmem, err := syscall.Mmap(int(f.Fd()), 0, shm_size, 223 | syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED) 224 | if err != nil { 225 | f.Close() 226 | return nil, nil, err 227 | } 228 | 229 | return f, shmem, err 230 | } 231 | 232 | // Parses request reading from write queue character device. 233 | func (b *Buse) parseWriteRequest(request []byte) ([]byte, uint64, uint64) { 234 | raw := make([]byte, 8) 235 | copy(raw, request[:8]) 236 | offset := binary.LittleEndian.Uint64(raw) 237 | writesLen := binary.LittleEndian.Uint64(request[8:16]) 238 | 239 | return raw, offset, writesLen 240 | } 241 | 242 | // Parses request reading from read queue character device. 243 | func (b *Buse) parseReadRequest(request []byte) ([]byte, uint64, uint64, uint64) { 244 | raw := make([]byte, 8) 245 | copy(raw, request[16:24]) 246 | offset := binary.LittleEndian.Uint64(raw) 247 | 248 | sector := binary.LittleEndian.Uint64(request[:8]) * 512 / uint64(b.Options.BlockSize) 249 | length := binary.LittleEndian.Uint64(request[8:16]) * 512 / uint64(b.Options.BlockSize) 250 | 251 | return raw, offset, sector, length 252 | } 253 | 254 | // True if the request means termination of the device. 255 | func isTermination(offset uint64) bool { 256 | return offset == ^uint64(0) 257 | } 258 | 259 | // True if the request is flush. 260 | func isFlush(offset uint64) bool { 261 | return offset > (1 << 32) 262 | } 263 | 264 | func (b *Buse) bindToLocalNumaNode(cpuId int) { 265 | localNode := cpuId / b.Options.CPUsPerNode 266 | firstCpu := localNode * b.Options.CPUsPerNode 267 | lastCpu := firstCpu + b.Options.CPUsPerNode - 1 268 | 269 | cpuSet := unix.CPUSet{} 270 | cpuSet.Zero() 271 | 272 | for c := firstCpu; c <= lastCpu; c++ { 273 | cpuSet.Set(c) 274 | } 275 | 276 | unix.SchedSetaffinity(0, &cpuSet) 277 | } 278 | 279 | // Infinite loop reading from write queue character device and calling 280 | // BuseWrite() callback provided by calling application. 
When the BuseWrite() 281 | // returns then the batched write is confirmed to the kernel leading to the 282 | // recycling of the buffer in shared memory. 283 | func (b *Buse) writer(chardev string, wgFunc *sync.WaitGroup, shm_size int) { 284 | defer wgFunc.Done() 285 | 286 | var major, cpuId int 287 | fmt.Sscanf(chardev, buseWritePathFmt, &major, &cpuId) 288 | b.bindToLocalNumaNode(cpuId) 289 | 290 | controlFile, shmem, err := openAndMmapControlFile(chardev, shm_size) 291 | if err != nil { 292 | panic(err) 293 | } 294 | defer controlFile.Close() 295 | defer syscall.Munmap(shmem) 296 | 297 | requestBuffer := make([]byte, writeRequestSize) 298 | wg := sync.WaitGroup{} 299 | for { 300 | _, err := controlFile.Read(requestBuffer) 301 | if err != nil { 302 | continue 303 | } 304 | 305 | offsetRaw, offset, writesLen := b.parseWriteRequest(requestBuffer) 306 | 307 | if isTermination(offset) { 308 | wg.Wait() 309 | return 310 | } 311 | 312 | if isFlush(offset) { 313 | if b.Options.Durable { 314 | wg.Wait() 315 | } 316 | controlFile.Write(offsetRaw) 317 | continue 318 | } 319 | 320 | dataRegion := shmem[offset : offset+uint64(b.Options.WriteChunkSize)] 321 | wg.Add(1) 322 | go func() { 323 | defer wg.Done() 324 | 325 | err := b.ReadWriter.BuseWrite(int64(writesLen), dataRegion) 326 | if err != nil { 327 | fmt.Fprintf(os.Stderr, "Chunk write (%d writes) failed!\n", writesLen) 328 | fmt.Fprint(os.Stderr, err) 329 | } 330 | 331 | n, err := controlFile.Write(offsetRaw) 332 | if err != nil { 333 | fmt.Fprint(os.Stderr, "Read ack error, n =", n, "err=", err.Error()) 334 | fmt.Fprint(os.Stderr, err) 335 | } 336 | }() 337 | } 338 | } 339 | 340 | // Infinite loop reading from read queue character device and calling 341 | // BuseRead() callback provided by calling application. When the BuseRead() 342 | // returns then the read request is acknowledged to the kernel. 343 | func (b *Buse) reader(chardev string, wgFunc *sync.WaitGroup, shm_size int) { 344 | defer wgFunc.Done() 345 | 346 | var major, cpuId int 347 | fmt.Sscanf(chardev, buseReadPathFmt, &major, &cpuId) 348 | b.bindToLocalNumaNode(cpuId) 349 | 350 | controlFile, shmem, err := openAndMmapControlFile(chardev, shm_size) 351 | if err != nil { 352 | panic(err) 353 | } 354 | defer controlFile.Close() 355 | defer syscall.Munmap(shmem) 356 | 357 | requestBuffer := make([]byte, readRequestSize) 358 | var wg sync.WaitGroup 359 | for { 360 | _, err := controlFile.Read(requestBuffer) 361 | if err != nil { 362 | continue 363 | } 364 | 365 | offsetRaw, offset, sector, length := b.parseReadRequest(requestBuffer) 366 | 367 | if isTermination(offset) { 368 | wg.Wait() 369 | return 370 | } 371 | 372 | size := int64(length) * b.Options.BlockSize 373 | dataRegion := shmem[int64(offset) : int64(offset)+size] 374 | 375 | wg.Add(1) 376 | go func() { 377 | defer wg.Done() 378 | 379 | err := b.ReadWriter.BuseRead(int64(sector), int64(length), dataRegion) 380 | if err != nil { 381 | fmt.Fprint(os.Stderr, err) 382 | } 383 | 384 | _, err = controlFile.Write(offsetRaw) 385 | if err != nil { 386 | fmt.Fprint(os.Stderr, err) 387 | } 388 | }() 389 | } 390 | } 391 | 392 | // Bind all the control queues and start processing read and write commands. 393 | // This is done via multiple readers and writers. One worker per queue. 
394 | func (b *Buse) Run() {
395 | 	b.ReadWriter.BusePreRun()
396 | 
397 | 	var wg sync.WaitGroup
398 | 	wg.Add(int(b.Options.Threads) * 2)
399 | 	for i := 0; i < int(b.Options.Threads); i++ {
400 | 		w := fmt.Sprintf(buseWritePathFmt, b.Options.Major, i)
401 | 		r := fmt.Sprintf(buseReadPathFmt, b.Options.Major, i)
402 | 
403 | 		go b.writer(w, &wg, int(b.Options.WriteShmSize))
404 | 		go b.reader(r, &wg, int(b.Options.ReadShmSize))
405 | 	}
406 | 	wg.Wait()
407 | }
408 | 
409 | // Writes a value to a configfs variable.
410 | func (b *Buse) setConfig(variable string, value int64) error {
411 | 	configFsPath := fmt.Sprint(configFsPath, "/", b.Options.Major, "/", variable)
412 | 	byteValue := []byte(fmt.Sprint(value))
413 | 
414 | 	err := ioutil.WriteFile(configFsPath, byteValue, 0644)
415 | 
416 | 	return err
417 | }
418 | 
419 | // Stops the buse device. All requests are refused, but the device is still visible
420 | // and can be started again.
421 | func (b *Buse) StopDevice() error {
422 | 	err := b.setConfig("power", 0)
423 | 	return err
424 | }
425 | 
426 | // Removes the device. The device is unregistered as a block device.
427 | func (b *Buse) RemoveDevice() error {
428 | 	err := syscall.Rmdir(fmt.Sprint(configFsPath, "/", b.Options.Major))
429 | 	b.ReadWriter.BusePostRemove()
430 | 	return err
431 | }
432 | 
--------------------------------------------------------------------------------
/lib/go/buse/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/asch/buse/lib/go/buse
2 | 
3 | go 1.16
4 | 
--------------------------------------------------------------------------------
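
As a rough illustration of how the pieces above fit together from user space, the following sketch implements a RAM-backed device on top of the Go library: a type satisfying `BuseReadWriter` is handed to `buse.New()`, and `Run()` binds the read and write queues and dispatches requests to it. This program is not part of the repository; the `memDisk` type and the chosen option values are made up for the example, and the write-chunk layout it decodes (a fixed array of 32-byte `writelist_item` descriptors followed by the data area, with sector and length counted in 512-byte sectors, data stored in descriptor order) is inferred from `kernel/main.h` and `kernel/buse-wqueue.c`. Running it assumes the `buse` module is loaded and the process may use configfs and the `/dev/buse*` character devices.

```
package main

import (
	"encoding/binary"
	"log"

	"github.com/asch/buse/lib/go/buse"
)

const (
	blockSize      = 4096            // Logical block size of the device.
	writeChunkSize = 2 * 1024 * 1024 // Must match Options.WriteChunkSize.
	deviceSize     = 1 << 30         // 1 GiB RAM-backed device.
	sectorSize     = 512             // Write descriptors use 512-byte sectors.
)

// memDisk keeps the whole device in memory; just enough to exercise the API.
type memDisk struct {
	data []byte
}

// BuseRead fills chunk with `length` blocks starting at block `sector`.
func (d *memDisk) BuseRead(sector, length int64, chunk []byte) error {
	offset := sector * blockSize
	copy(chunk, d.data[offset:offset+length*blockSize])
	return nil
}

// BuseWrite replays the batched writes from one write chunk. Assumed layout:
// writeChunkSize/blockSize descriptor slots of 32 bytes (sector, len, id, flag
// as little-endian u64) followed by the data area. Only plain writes are
// expected, which matches the module defaults (discard etc. disabled).
func (d *memDisk) BuseWrite(writes int64, chunk []byte) error {
	const descSize = 32
	maxWrites := writeChunkSize / blockSize
	data := chunk[maxWrites*descSize:]

	for i := int64(0); i < writes; i++ {
		desc := chunk[i*descSize:]
		sector := binary.LittleEndian.Uint64(desc[0:8])
		lenSectors := binary.LittleEndian.Uint64(desc[8:16])
		n := lenSectors * sectorSize

		copy(d.data[sector*sectorSize:], data[:n])
		data = data[n:]
	}
	return nil
}

func (d *memDisk) BusePreRun()     { log.Println("buse device starting") }
func (d *memDisk) BusePostRemove() { log.Println("buse device removed") }

func main() {
	dev := &memDisk{data: make([]byte, deviceSize)}

	b, err := buse.New(dev, buse.Options{
		Size:           deviceSize,
		BlockSize:      blockSize,
		WriteChunkSize: writeChunkSize,
		WriteShmSize:   32 * 1024 * 1024,
		ReadShmSize:    32 * 1024 * 1024,
		QueueDepth:     64,
		CollisionArea:  4096,
		Threads:        4,
		Major:          0, // Device index; queues appear as /dev/buse0-r* and /dev/buse0-w*.
	})
	if err != nil {
		log.Fatal(err)
	}

	b.Run() // Blocks until the device is stopped and all queues terminate.
}
```

Tearing the device down maps to `StopDevice()`, which writes 0 to the configfs `power` attribute, and `RemoveDevice()`, which removes the configfs directory and triggers `BusePostRemove()`.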