├── README.md
├── buse.png
├── kernel
├── Makefile
├── buse-blkdev.c
├── buse-blkdev.h
├── buse-chrdev.c
├── buse-chrdev.h
├── buse-configfs.c
├── buse-configfs.h
├── buse-rqueue.c
├── buse-rqueue.h
├── buse-wqueue.c
├── buse-wqueue.h
├── main.c
└── main.h
└── lib
└── go
└── buse
├── buse.go
└── go.mod
/README.md:
--------------------------------------------------------------------------------
1 | # BUSE: Block Device in Userspace
2 |
3 | ## Write performance comparison
4 |
5 |
6 |
7 | ## Requirements
8 |
9 | * GNU Make
10 | * Linux Kernel 5.11 or newer
11 | * Linux Kernel Headers
12 |
13 | ## Installation
14 |
15 | ```
16 | cd kernel
17 | make
18 | sudo make install
19 | sudo modprobe buse
20 | ```
21 |
--------------------------------------------------------------------------------
/buse.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/asch/buse/f12ccb1d15a93539b0df1119b3ce2055ba51f50b/buse.png
--------------------------------------------------------------------------------
/kernel/Makefile:
--------------------------------------------------------------------------------
1 | obj-m += buse.o
2 | buse-objs := \
3 | main.o \
4 | buse-blkdev.o \
5 | buse-chrdev.o \
6 | buse-wqueue.o \
7 | buse-rqueue.o \
8 | buse-configfs.o
9 |
10 | MODULEDIR := /lib/modules/$(shell uname -r)
11 | KERNELDIR := $(MODULEDIR)/build
12 |
13 | SOURCES := $(wildcard *.c)
14 | HEADERS := $(wildcard *.h)
15 |
16 | #CC += -DDEBUG
17 |
18 | build: buse.ko
19 |
20 | buse.ko: $(SOURCES) $(HEADERS)
21 | make -C $(KERNELDIR) M=$(shell pwd) modules
22 |
23 | install: buse.ko
24 | install -D -m 644 $(shell pwd)/$< $(MODULEDIR)/extra/$<
25 | strip --strip-debug $(MODULEDIR)/extra/$<
26 | depmod
27 |
28 | clean:
29 | make -C $(KERNELDIR) M=$(shell pwd) clean
30 |
--------------------------------------------------------------------------------
/kernel/buse-blkdev.c:
--------------------------------------------------------------------------------
1 | /* Copyright (C) 2021-2022 Vojtech Aschenbrenner */
2 |
3 | #include
4 | #include
5 | #include
6 | #include
7 | #include
8 |
9 | #include "buse-blkdev.h"
10 | #include "buse-rqueue.h"
11 | #include "buse-wqueue.h"
12 | #include "main.h"
13 |
14 | /*
15 | * Init function called for every queue of created device. We just fill user data and compute the
16 | * queue id.
17 | */
18 | static int buse_init_hctx(struct blk_mq_hw_ctx *hw_ctx, void *driver_data, unsigned int hw_ctx_id)
19 | {
20 | struct buse *buse = hw_ctx->queue->queuedata;
21 | struct buse_queue *q = buse->queues;
22 |
23 | q[hw_ctx_id].id = hw_ctx_id;
24 | hw_ctx->driver_data = &q[hw_ctx_id];
25 | buse->num_queues++;
26 |
27 | return 0;
28 | }
29 |
30 | /*
31 | * io request callback called for every io in the queue. This function is called by blk-mq.
32 | */
33 | static blk_status_t buse_queue_rq(struct blk_mq_hw_ctx *hw_ctx, const struct blk_mq_queue_data *data)
34 | {
35 | struct request *r = data->rq;
36 | struct buse_cmd *cmd = blk_mq_rq_to_pdu(r);
37 | struct buse *buse;
38 |
39 | cmd->rq = r;
40 | cmd->queue = hw_ctx->driver_data;
41 | cmd->canceled = false;
42 | cmd->magic = BUSE_MAGIC;
43 |
44 | buse = cmd->queue->r.buse;
45 | if (atomic_read(&buse->stopped) == 1)
46 | return BLK_STS_IOERR;
47 |
48 | switch (req_op(r)) {
49 | case REQ_OP_DISCARD:
50 | case REQ_OP_WRITE_SAME:
51 | case REQ_OP_WRITE_ZEROES:
52 | case REQ_OP_SECURE_ERASE:
53 | case REQ_OP_WRITE:
54 | return buse_write(cmd);
55 | case REQ_OP_FLUSH:
56 | return buse_flush(cmd);
57 | case REQ_OP_READ:
58 | return buse_read(cmd);
59 | }
60 |
61 | pr_warn("Unsupported request no. %d\n", req_op(r));
62 |
63 | return BLK_STS_IOERR;
64 | }
65 |
66 | static const struct block_device_operations buse_blkdev_ops = {
67 | .owner = THIS_MODULE,
68 | };
69 |
70 | /*
71 | * When io request times out we just print warning to the dmesg a give it another chance. This is
72 | * the best we can do. If the device is eventually stopped, these requests will be canceled.
73 | */
74 | static enum blk_eh_timer_return buse_timeout(struct request *rq, bool b)
75 | {
76 | pr_warn("Request timed out! Is userspace connected? (rq = %p)\n", rq);
77 |
78 | return BLK_EH_RESET_TIMER;
79 | }
80 |
81 | /*
82 | * Control structure for blk-mq operations.
83 | */
84 | static const struct blk_mq_ops buse_mq_ops = {
85 | .init_hctx = buse_init_hctx,
86 | .queue_rq = buse_queue_rq,
87 | .timeout = buse_timeout,
88 | };
89 |
90 | /*
91 | * blk-mq tags initialization.
92 | */
93 | static void buse_set_tag_set(struct buse *buse)
94 | {
95 | struct blk_mq_tag_set *tag_set = &buse->blkdev.tag_set;
96 |
97 | tag_set->cmd_size = sizeof(struct buse_cmd);
98 | tag_set->driver_data = buse;
99 | tag_set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
100 | if (buse->no_scheduler)
101 | tag_set->flags |= BLK_MQ_F_NO_SCHED;
102 | tag_set->nr_hw_queues = buse->hw_queues;
103 | tag_set->numa_node = NUMA_NO_NODE;
104 | tag_set->ops = &buse_mq_ops;
105 | tag_set->queue_depth = buse->queue_depth;
106 | }
107 |
108 | /*
109 | * Block device initialization. All configuration parameters are set according to the configured
110 | * values in struct buse. This is only related to the block device side of the module.
111 | */
112 | int buse_blkdev_init(struct buse *buse)
113 | {
114 | int ret;
115 | struct buse_blkdev *blkdev = &buse->blkdev;
116 | struct blk_mq_tag_set *tag_set = &blkdev->tag_set;
117 | size_t max_writes = buse->write_chunk_size / buse->block_size;
118 | size_t writelist_size = max_writes * sizeof(struct writelist_item);
119 | unsigned int max_hw_sectors;
120 |
121 | blkdev->disk = alloc_disk_node(1, NUMA_NO_NODE);
122 | if (!blkdev->disk) {
123 | ret = -ENOMEM;
124 | goto err;
125 | }
126 |
127 | buse_set_tag_set(buse);
128 |
129 | ret = blk_mq_alloc_tag_set(tag_set);
130 | if (ret)
131 | goto err_disk;
132 |
133 | blkdev->request_queue = blk_mq_init_queue_data(tag_set, buse);
134 | if (IS_ERR(blkdev->request_queue)) {
135 | ret = PTR_ERR(blkdev->request_queue);
136 | goto err_tag;
137 | }
138 |
139 | blk_queue_write_cache(blkdev->request_queue, true, false);
140 |
141 | max_hw_sectors = (buse->write_chunk_size - writelist_size) / SECTOR_SIZE;
142 | if (max_hw_sectors > buse->read_shm_size / SECTOR_SIZE)
143 | max_hw_sectors = buse->read_shm_size / SECTOR_SIZE;
144 | blk_queue_max_hw_sectors(blkdev->request_queue, max_hw_sectors);
145 |
146 | blk_queue_flag_set(QUEUE_FLAG_NONROT, blkdev->request_queue);
147 | blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, blkdev->request_queue);
148 | blk_queue_logical_block_size(blkdev->request_queue, buse->block_size);
149 | blk_queue_physical_block_size(blkdev->request_queue, buse->block_size);
150 |
151 | if (buse->io_min < buse->block_size || buse->io_min % buse->block_size != 0)
152 | buse->io_min = buse->block_size;
153 | blk_queue_io_min(blkdev->request_queue, buse->io_min);
154 |
155 | if (buse->io_opt < buse->block_size || buse->io_opt % buse->block_size != 0)
156 | buse->io_opt = buse->block_size;
157 | blk_queue_io_opt(blkdev->request_queue, buse->io_opt);
158 |
159 | blk_queue_max_segments(blkdev->request_queue, USHRT_MAX);
160 | blk_queue_max_segment_size(blkdev->request_queue, UINT_MAX);
161 |
162 | if (buse->can_write_same)
163 | blk_queue_max_write_same_sectors(blkdev->request_queue, UINT_MAX);
164 |
165 | if (buse->can_write_zeroes)
166 | blk_queue_max_write_zeroes_sectors(blkdev->request_queue, UINT_MAX);
167 |
168 | if (buse->can_discard) {
169 | blk_queue_flag_set(QUEUE_FLAG_DISCARD, blkdev->request_queue);
170 | blkdev->request_queue->limits.discard_granularity = buse->block_size;
171 | blkdev->request_queue->limits.discard_alignment = buse->block_size;
172 |
173 | blk_queue_max_discard_sectors(blkdev->request_queue, UINT_MAX);
174 | blk_queue_max_discard_segments(blkdev->request_queue, USHRT_MAX);
175 | }
176 |
177 | if (buse->can_secure_erase)
178 | blk_queue_flag_set(QUEUE_FLAG_SECERASE, blkdev->request_queue);
179 |
180 | return 0;
181 |
182 | err_tag:
183 | blk_mq_free_tag_set(tag_set);
184 | err_disk:
185 | put_disk(blkdev->disk);
186 | err:
187 | return ret;
188 | }
189 |
190 | /*
191 | * Remove the block device if it was created, otherwise just cleanup tagset.
192 | */
193 | void buse_blkdev_exit(struct buse *buse)
194 | {
195 | if (buse->blkdev.created) {
196 | del_gendisk(buse->blkdev.disk);
197 | put_disk(buse->blkdev.disk);
198 | }
199 |
200 | blk_cleanup_queue(buse->blkdev.request_queue);
201 | blk_mq_free_tag_set(&buse->blkdev.tag_set);
202 | buse->blkdev.created = false;
203 | }
204 |
205 | /*
206 | * Registers the block device so that it is visible to the system.
207 | */
208 | void buse_gendisk_register(struct buse *buse)
209 | {
210 | struct gendisk *disk = buse->blkdev.disk;
211 |
212 | disk->major = buse_blkdev_major;
213 | disk->minors = buse_blkdev_max_minors;
214 | disk->first_minor = buse->index * disk->minors;
215 | disk->flags |= GENHD_FL_EXT_DEVT;
216 | disk->fops = &buse_blkdev_ops;
217 | disk->private_data = buse;
218 | disk->queue = buse->blkdev.request_queue;
219 | snprintf(disk->disk_name, DISK_NAME_LEN, "%s%llu", buse_blkdev_name, buse->index);
220 |
221 | /* Capacity needs to be set to 0, otherwise add_disk() hangs! Correct
222 | * capacity is set afterwards. */
223 | set_capacity(buse->blkdev.disk, 0);
224 | add_disk(disk);
225 | set_capacity(buse->blkdev.disk, buse->size >> SECTOR_SHIFT);
226 | }
227 |
228 | /*
229 | * Returns numa node for given queue id.
230 | */
231 | int buse_get_numa_node_for_queue_id(struct buse *buse, int queue_id)
232 | {
233 | int i;
234 | struct blk_mq_queue_map *qmap = &buse->blkdev.tag_set.map[HCTX_TYPE_DEFAULT];
235 |
236 | for_each_possible_cpu(i) {
237 | if (queue_id == qmap->mq_map[i])
238 | return cpu_to_node(i);
239 | }
240 |
241 | return NUMA_NO_NODE;
242 | }
243 |
--------------------------------------------------------------------------------
/kernel/buse-blkdev.h:
--------------------------------------------------------------------------------
1 | /* Copyright (C) 2021 Vojtech Aschenbrenner */
2 |
3 | #ifndef BUSE_BLKDEV_H
4 | #define BUSE_BLKDEV_H
5 |
6 | #include "main.h"
7 |
8 | /*
9 | * Block device initialization. All configuration parameters are set according to the configured
10 | * values in struct buse. This is only related to the block device side of the module.
11 | */
12 | int buse_blkdev_init(struct buse *buse);
13 |
14 |
15 | /*
16 | * Remove the block device if it was created, otherwiese just cleanup tagset.
17 | */
18 | void buse_blkdev_exit(struct buse *buse);
19 |
20 |
21 | /*
22 | * Registers the block device so that it is visible to the system.
23 | */
24 | void buse_gendisk_register(struct buse *buse);
25 |
26 | /*
27 | * Returns numa node for given queue id.
28 | */
29 | int buse_get_numa_node_for_queue_id(struct buse *buse, int queue_id);
30 |
31 | #endif
32 |
--------------------------------------------------------------------------------
/kernel/buse-chrdev.c:
--------------------------------------------------------------------------------
1 | /* Copyright (C) 2021 Vojtech Aschenbrenner */
2 |
3 | #include
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 |
11 | #include "buse-blkdev.h"
12 | #include "buse-chrdev.h"
13 | #include "buse-rqueue.h"
14 | #include "buse-wqueue.h"
15 | #include "main.h"
16 |
17 | /*
18 | * Callback for mmap(). It is reserved for future usage.
19 | */
20 | static void vm_open(struct vm_area_struct *vma)
21 | {
22 | }
23 |
24 | /*
25 | * Callback for munap(). It is reserved for future usage.
26 | */
27 | static void vm_close(struct vm_area_struct *vma)
28 | {
29 | }
30 |
31 | /*
32 | * VM fault callback for write queue. First pass through the shared memory generates faults and
33 | * fills the address mapping.
34 | */
35 | static vm_fault_t vm_fault_wqueue(struct vm_fault *vmf)
36 | {
37 | struct buse_wqueue *wq = vmf->vma->vm_private_data;
38 | pgoff_t offset = vmf->pgoff << PAGE_SHIFT;
39 | struct page *page;
40 |
41 | if (offset >= wq->buse->write_shm_size)
42 | return -EFAULT;
43 |
44 | page = vmalloc_to_page(wq->shmem + offset);
45 |
46 | get_page(page);
47 | vmf->page = page;
48 |
49 | return 0;
50 | }
51 |
52 | /*
53 | * VM fault callback for read queue. First pass through the shared memory generates faults and fills
54 | * the address mapping.
55 | */
56 | static vm_fault_t vm_fault_rqueue(struct vm_fault *vmf)
57 | {
58 | struct buse_rqueue *rq = vmf->vma->vm_private_data;
59 | pgoff_t offset = vmf->pgoff << PAGE_SHIFT;
60 | struct page *page;
61 |
62 | if (offset >= rq->buse->write_shm_size)
63 | return -EFAULT;
64 |
65 | page = vmalloc_to_page(rq->shmem + offset);
66 |
67 | get_page(page);
68 | vmf->page = page;
69 |
70 | return 0;
71 | }
72 |
73 | struct buse_wqueue *inode_get_wqueue(struct inode *inode)
74 | {
75 | return container_of(inode->i_cdev, struct buse_wqueue, chrdev.cdev);
76 | }
77 |
78 | struct buse_rqueue *inode_get_rqueue(struct inode *inode)
79 | {
80 | return container_of(inode->i_cdev, struct buse_rqueue, chrdev.cdev);
81 | }
82 |
83 | /*
84 | * File close() callback for write queue. We immediately set that the queue is unbound.
85 | */
86 | static int chrdev_release_wqueue(struct inode *inode, struct file *file)
87 | {
88 | struct buse_wqueue *wq = inode_get_wqueue(inode);
89 | if (!wq || atomic_read(&wq->bound) == 0)
90 | return -EFAULT;
91 |
92 | atomic_set(&wq->bound, 0);
93 |
94 | return 0;
95 | }
96 |
97 | /*
98 | * File close() callback for read queue. We immediately set that the queue is unbound.
99 | */
100 | static int chrdev_release_rqueue(struct inode *inode, struct file *file)
101 | {
102 | struct buse_rqueue *rq = inode_get_rqueue(inode);
103 | if (!rq || atomic_read(&rq->bound) == 0)
104 | return -EFAULT;
105 |
106 | atomic_set(&rq->bound, 0);
107 |
108 | return 0;
109 | }
110 |
111 | /*
112 | * File open() callback for write queue. We immediately set that the queue is bound.
113 | */
114 | static int chrdev_open_wqueue(struct inode *inode, struct file *file)
115 | {
116 | struct buse_wqueue *wq = inode_get_wqueue(inode);
117 | if (!wq || atomic_read(&wq->bound) == 1)
118 | return -EFAULT;
119 |
120 | file->private_data = wq;
121 | buse_wqueue_bind(wq);
122 |
123 | return 0;
124 | }
125 |
126 | /*
127 | * File open() callback for read queue. We immediately set that the queue is bound.
128 | */
129 | static int chrdev_open_rqueue(struct inode *inode, struct file *file)
130 | {
131 | struct buse_rqueue *rq = inode_get_rqueue(inode);
132 | if (!rq || atomic_read(&rq->bound) == 1)
133 | return -EFAULT;
134 |
135 | file->private_data = rq;
136 | buse_rqueue_bind(rq);
137 |
138 | return 0;
139 | }
140 |
141 | static struct vm_operations_struct vm_ops_wqueue = {
142 | .close = vm_close,
143 | .fault = vm_fault_wqueue,
144 | .open = vm_open,
145 | };
146 |
147 | static struct vm_operations_struct vm_ops_rqueue = {
148 | .close = vm_close,
149 | .fault = vm_fault_rqueue,
150 | .open = vm_open,
151 | };
152 |
153 | /*
154 | * File mmap() callback for write queue.
155 | */
156 | static int chrdev_mmap_wqueue(struct file *file, struct vm_area_struct *vma)
157 | {
158 | vma->vm_ops = &vm_ops_wqueue;
159 | vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
160 | vma->vm_private_data = file->private_data;
161 | vm_open(vma);
162 |
163 | return 0;
164 | }
165 |
166 | /*
167 | * File mmap() callback for read queue.
168 | */
169 | static int chrdev_mmap_rqueue(struct file *file, struct vm_area_struct *vma)
170 | {
171 | vma->vm_ops = &vm_ops_rqueue;
172 | vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
173 | vma->vm_private_data = file->private_data;
174 | vm_open(vma);
175 |
176 | return 0;
177 | }
178 |
179 | /*
180 | * File write() callback for read queue. Writing to the read queue character device the userspace
181 | * acknowledge that the read request is done. The written value is offset to the shared memory.
182 | */
183 | ssize_t chrdev_write_rqueue(struct file *file, const char __user *buf, size_t len, loff_t *off)
184 | {
185 | u64 data_offset;
186 | unsigned long ret;
187 | struct buse_rqueue *rq = inode_get_rqueue(file->f_inode);
188 |
189 | if (len != 8) {
190 | BUG();
191 | return 0;
192 | }
193 |
194 | if (*off != 0) {
195 | BUG();
196 | return 0;
197 | }
198 |
199 | ret = copy_from_user(&data_offset, buf, len);
200 | if (ret) {
201 | pr_alert("Cannot copy\n");
202 | return -ENOMEM;
203 | }
204 |
205 | ack_read_request(rq, data_offset, false);
206 |
207 | *off = 0;
208 |
209 | return len;
210 | }
211 |
212 | /*
213 | * File write() callback for write queue. Writing to the write queue character device the userspace
214 | * acknowledge that the write chunk was processed. The written value is offset to the shared memory.
215 | */
216 | ssize_t chrdev_write_wqueue(struct file *file, const char __user *buf, size_t len, loff_t *off)
217 | {
218 | u64 chunk_offset;
219 | struct buse_wqueue *wq = inode_get_wqueue(file->f_inode);
220 | unsigned long ret = copy_from_user(&chunk_offset, buf, len);
221 |
222 | if (len != 8) {
223 | BUG();
224 | return 0;
225 | }
226 |
227 | if (*off != 0) {
228 | BUG();
229 | return 0;
230 | }
231 |
232 | if (ret) {
233 | pr_alert("Cannot copy\n");
234 | return -ENOMEM;
235 | }
236 |
237 | ack_write_request(wq, chunk_offset, false);
238 |
239 | return len;
240 | }
241 |
242 | /*
243 | * File read() callback for write queue. Userspace reads metadata about the write chunk coming to
244 | * the block device. It is number of batched writes in the chunk and offset to the shared memory
245 | * where the chunks is located.
246 | */
247 | ssize_t chrdev_read_wqueue(struct file *file, char __user *buf, size_t len, loff_t *off)
248 | {
249 | struct write_chunk *chunk;
250 | int ret;
251 | struct buse_wqueue *wq = inode_get_wqueue(file->f_inode);
252 |
253 | if (len != 16) {
254 | BUG();
255 | return 0;
256 | }
257 |
258 | if (*off != 0) {
259 | BUG();
260 | return 0;
261 | }
262 |
263 | chunk = pop_write_request_wait(wq);
264 | if (IS_ERR(chunk)) {
265 | return PTR_ERR(chunk);
266 | }
267 |
268 | ret = copy_to_user(buf, &chunk->shmem_offset, sizeof(chunk->shmem_offset));
269 | buf += sizeof(chunk->shmem_offset);
270 | if (ret) {
271 | pr_alert("copy_to_user failed\n");
272 | return -EFAULT;
273 | }
274 |
275 | ret = copy_to_user(buf, &chunk->num_writes, sizeof(chunk->num_writes));
276 | if (ret) {
277 | pr_alert("copy_to_user failed\n");
278 | return -EFAULT;
279 | }
280 |
281 | if (is_wqueue_term(chunk))
282 | kfree(chunk);
283 |
284 | return len;
285 | }
286 |
287 | /*
288 | * File read() callback for read queue. Userspace reads metadata about the read request coming to
289 | * the block device and offset to the shared memory where data should be read into.
290 | */
291 | ssize_t chrdev_read_rqueue(struct file *file, char __user *buf, size_t len, loff_t *off)
292 | {
293 | struct buse_rqueue *rq = inode_get_rqueue(file->f_inode);
294 | struct read_chunk *chunk;
295 | int ret;
296 |
297 | if (len != 24) {
298 | BUG();
299 | return 0;
300 | }
301 |
302 | if (*off != 0) {
303 | BUG();
304 | return 0;
305 | }
306 |
307 | chunk = pop_read_request_wait(rq);
308 | if (IS_ERR(chunk)) {
309 | return PTR_ERR(chunk);
310 | }
311 |
312 | ret = copy_to_user(buf, &chunk->sector, sizeof(chunk->sector));
313 | buf += sizeof(chunk->sector);
314 | if (ret) {
315 | pr_alert("copy_to_user failed\n");
316 | return -EFAULT;
317 | }
318 |
319 | ret = copy_to_user(buf, &chunk->len, sizeof(chunk->len));
320 | buf += sizeof(chunk->len);
321 | if (ret) {
322 | pr_alert("copy_to_user failed\n");
323 | return -EFAULT;
324 | }
325 |
326 | ret = copy_to_user(buf, &chunk->shmem_offset, sizeof(chunk->shmem_offset));
327 | buf += sizeof(chunk->shmem_offset);
328 | if (ret) {
329 | pr_alert("copy_to_user failed\n");
330 | return -EFAULT;
331 | }
332 |
333 | if (is_rqueue_term(chunk))
334 | kfree(chunk);
335 |
336 | return len;
337 | }
338 |
339 | const struct file_operations chrdev_fops_wqueue = {
340 | .mmap = chrdev_mmap_wqueue,
341 | .open = chrdev_open_wqueue,
342 | .owner = THIS_MODULE,
343 | .read = chrdev_read_wqueue,
344 | .write = chrdev_write_wqueue,
345 | .release = chrdev_release_wqueue,
346 | };
347 |
348 | const struct file_operations chrdev_fops_rqueue = {
349 | .mmap = chrdev_mmap_rqueue,
350 | .open = chrdev_open_rqueue,
351 | .owner = THIS_MODULE,
352 | .read = chrdev_read_rqueue,
353 | .write = chrdev_write_rqueue,
354 | .release = chrdev_release_rqueue,
355 | };
356 |
357 | /*
358 | * Init one character device corresponding to one of the queues.
359 | */
360 | static int chrdev_queue_init(struct buse_chrdev *chrdev, dev_t minor, char *name,
361 | int i, const struct file_operations *fops)
362 | {
363 | int ret;
364 |
365 | chrdev->region = minor;
366 | cdev_init(&chrdev->cdev, fops);
367 | ret = cdev_add(&chrdev->cdev, minor, 1);
368 | if (ret < 0)
369 | goto err;
370 |
371 | chrdev->dev = device_create(buse_chrdev_class, NULL, minor, NULL,"%s%d", name, i);
372 | if (IS_ERR(chrdev->dev)) {
373 | ret = PTR_ERR(chrdev->dev);
374 | goto err_cdev;
375 | }
376 |
377 | return 0;
378 |
379 | err_cdev:
380 | cdev_del(&chrdev->cdev);
381 | err:
382 | return ret;
383 | }
384 |
385 | static void chrdev_queue_exit(struct buse_chrdev *chrdev)
386 | {
387 | device_destroy(buse_chrdev_class, chrdev->region);
388 | cdev_del(&chrdev->cdev);
389 | }
390 |
391 | /*
392 | * Deallocated read queues related character devices.
393 | */
394 | static void chrdev_rqueues_exit(struct buse *buse)
395 | {
396 | int i;
397 | struct buse_queue *q = buse->queues;
398 | struct buse_rqueue *rq;
399 | dev_t minor;
400 |
401 | for (i = 0, q = buse->queues; i < buse->num_queues; i++, q++) {
402 | rq = &q->r;
403 | minor = rq->chrdev.region;
404 | chrdev_queue_exit(&rq->chrdev);
405 | }
406 |
407 | minor -= buse->num_queues - 1;
408 | unregister_chrdev_region(minor, buse->num_queues);
409 | }
410 |
411 | /*
412 | * Deallocated write queues related character devices.
413 | */
414 | static void chrdev_wqueues_exit(struct buse *buse)
415 | {
416 | int i;
417 | struct buse_queue *q = buse->queues;
418 | struct buse_wqueue *wq;
419 | dev_t minor;
420 |
421 | for (i = 0, q = buse->queues; i < buse->num_queues; i++, q++) {
422 | wq = &q->w;
423 | minor = wq->chrdev.region;
424 | chrdev_queue_exit(&wq->chrdev);
425 | }
426 |
427 | minor -= buse->num_queues - 1;
428 | unregister_chrdev_region(minor, buse->num_queues);
429 | }
430 |
431 | /*
432 | * Allocate read queues related character devices.
433 | */
434 | static int chrdev_rqueues_init(struct buse *buse)
435 | {
436 | int ret, i;
437 | struct buse_queue *q;
438 | struct buse_rqueue *rq;
439 | dev_t minor;
440 | char name[DISK_NAME_LEN];
441 | snprintf(name, DISK_NAME_LEN, "%s%llu-r", buse_blkdev_name, buse->index);
442 |
443 | ret = alloc_chrdev_region(&minor, 0, buse->num_queues, name);
444 | if (ret < 0)
445 | goto err;
446 |
447 | for (i = 0, q = buse->queues; i < buse->num_queues; i++, q++, minor++) {
448 | rq = &q->r;
449 | ret = chrdev_queue_init(&rq->chrdev, minor, name, i, &chrdev_fops_rqueue);
450 | if (ret)
451 | goto err_alloc;
452 | }
453 |
454 | return 0;
455 |
456 | err_alloc:
457 | for (; i > 0; i--, q--, minor--) {
458 | rq = &q->r;
459 | chrdev_queue_exit(&rq->chrdev);
460 | }
461 |
462 | unregister_chrdev_region(minor, buse->num_queues);
463 | err:
464 | return ret;
465 | }
466 |
467 | /*
468 | * Allocate write queues related character devices.
469 | */
470 | static int chrdev_wqueues_init(struct buse *buse)
471 | {
472 | int ret, i;
473 | struct buse_queue *q;
474 | struct buse_wqueue *wq;
475 | dev_t minor;
476 | char name[DISK_NAME_LEN];
477 | snprintf(name, DISK_NAME_LEN, "%s%llu-w", buse_blkdev_name, buse->index);
478 |
479 | ret = alloc_chrdev_region(&minor, 0, buse->num_queues, name);
480 | if (ret < 0)
481 | goto err;
482 |
483 | for (i = 0, q = buse->queues; i < buse->num_queues; i++, q++, minor++) {
484 | wq = &q->w;
485 | ret = chrdev_queue_init(&wq->chrdev, minor, name, i, &chrdev_fops_wqueue);
486 | if (ret)
487 | goto err_alloc;
488 | }
489 |
490 | return 0;
491 |
492 | err_alloc:
493 | for (; i > 0; i--, q--, minor--) {
494 | wq = &q->w;
495 | chrdev_queue_exit(&wq->chrdev);
496 | }
497 |
498 | unregister_chrdev_region(minor, buse->num_queues);
499 | err:
500 | return ret;
501 | }
502 |
503 | /*
504 | * Init all needed character devices for queues to the userspace.
505 | */
506 | int buse_chrdev_init(struct buse *buse)
507 | {
508 | int ret;
509 |
510 | ret = chrdev_wqueues_init(buse);
511 | if (ret)
512 | goto err;
513 |
514 | ret = chrdev_rqueues_init(buse);
515 | if (ret)
516 | goto err_wqueues;
517 |
518 | return 0;
519 |
520 | err_wqueues:
521 | chrdev_wqueues_exit(buse);
522 | err:
523 | return ret;
524 | }
525 |
526 | void buse_chrdev_exit(struct buse *buse)
527 | {
528 | chrdev_rqueues_exit(buse);
529 | chrdev_wqueues_exit(buse);
530 | }
531 |
--------------------------------------------------------------------------------
/kernel/buse-chrdev.h:
--------------------------------------------------------------------------------
1 | /* Copyright (C) 2021 Vojtech Aschenbrenner */
2 |
3 | #ifndef BUSE_CHRDEV_H
4 | #define BUSE_CHRDEV_H
5 |
6 | #include "main.h"
7 |
8 | /*
9 | * Init all needed character devices for queues to the userspace.
10 | */
11 | int buse_chrdev_init(struct buse *buse);
12 |
13 | void buse_chrdev_exit(struct buse *buse);
14 |
15 | #endif
16 |
--------------------------------------------------------------------------------
/kernel/buse-configfs.c:
--------------------------------------------------------------------------------
1 | /* Copyright (C) 2021 Vojtech Aschenbrenner */
2 |
3 | /*
4 | * This module contains all configfs related configuration. Every configfs attribute needs to define:
5 | *
6 | * 1) buse_`attr`_show() function returning the current value.
7 | *
8 | * 2) buse_`attr`_store() setting the value and eventually doing function calls.
9 | *
10 | * 3) define macro CONFIGFS_ATTR(buse_, `attr`);
11 | *
12 | * 4) put &buse_attr_`attr` record to the buse_attrs[]
13 | *
14 | * This process can be a bit repetitive for some attributes, but we keep it like that for better
15 | * control over allowed inserted values and not obfuscating the code with unclean macros.
16 | */
17 |
18 | #include
19 |
20 | #include "buse-configfs.h"
21 | #include "buse-rqueue.h"
22 | #include "buse-wqueue.h"
23 | #include "main.h"
24 |
25 | static inline struct buse *to_buse(struct config_item *item)
26 | {
27 | return item ? container_of(item, struct buse, item) : NULL;
28 | }
29 |
30 | static ssize_t buse_power_show(struct config_item *item, char *page)
31 | {
32 | struct buse *buse = to_buse(item);
33 |
34 | return snprintf(page, PAGE_SIZE, "%d\n", buse->power);
35 | }
36 |
37 | static ssize_t buse_power_store(struct config_item *item, const char *page, size_t count)
38 | {
39 | struct buse *buse = to_buse(item);
40 | int ret;
41 | bool power;
42 | int i;
43 | struct buse_wqueue *wq;
44 | struct buse_rqueue *rq;
45 |
46 | ret = kstrtobool(page, &power);
47 | if (ret)
48 | goto err;
49 |
50 | mutex_lock(&buse->configfs_mutex);
51 |
52 | if (power == buse->power) {
53 | ret = -EINVAL;
54 | goto err_mutex;
55 | }
56 |
57 | atomic_set(&buse->stopped, !power);
58 |
59 | if (!power)
60 | buse_stop(buse);
61 |
62 | if (power && buse->queues) {
63 | for (i = 0; i < buse->num_queues; i++) {
64 | wq = &buse->queues[i].w;
65 | mutex_lock(&wq->lock);
66 | wq->terminated = false;
67 | mutex_unlock(&wq->lock);
68 | }
69 |
70 | for (i = 0; i < buse->num_queues; i++) {
71 | rq = &buse->queues[i].r;
72 | mutex_lock(&rq->lock);
73 | rq->terminated = false;
74 | mutex_unlock(&rq->lock);
75 | }
76 | }
77 |
78 | if (power && !buse->queues) {
79 | ret = buse_on(buse);
80 | if (ret)
81 | goto err_mutex;
82 | }
83 |
84 | buse->power = power;
85 | ret = count;
86 |
87 | err_mutex:
88 | mutex_unlock(&buse->configfs_mutex);
89 | err:
90 | return ret;
91 | }
92 |
93 | CONFIGFS_ATTR(buse_, power);
94 |
95 | static ssize_t buse_hw_queues_show(struct config_item *item, char *page)
96 | {
97 | struct buse *buse = to_buse(item);
98 |
99 | return snprintf(page, PAGE_SIZE, "%llu\n", buse->hw_queues);
100 | }
101 |
102 | static ssize_t buse_hw_queues_store(struct config_item *item, const char *page, size_t count)
103 | {
104 | struct buse *buse = to_buse(item);
105 | int ret;
106 | u64 hw_queues;
107 |
108 | if (buse->power)
109 | return -EBUSY;
110 |
111 | ret = kstrtou64(page, 0, &hw_queues);
112 | if (ret)
113 | goto err;
114 |
115 | buse->hw_queues = hw_queues;
116 |
117 | return count;
118 |
119 | err:
120 | return ret;
121 | }
122 |
123 | CONFIGFS_ATTR(buse_, hw_queues);
124 |
125 | static ssize_t buse_queue_depth_show(struct config_item *item, char *page)
126 | {
127 | struct buse *buse = to_buse(item);
128 |
129 | return snprintf(page, PAGE_SIZE, "%llu\n", buse->queue_depth);
130 | }
131 |
132 | static ssize_t buse_queue_depth_store(struct config_item *item, const char *page, size_t count)
133 | {
134 | struct buse *buse = to_buse(item);
135 | int ret;
136 | u64 queue_depth;
137 |
138 | if (buse->power)
139 | return -EBUSY;
140 |
141 | ret = kstrtou64(page, 0, &queue_depth);
142 | if (ret)
143 | goto err;
144 |
145 | buse->queue_depth = queue_depth;
146 |
147 | return count;
148 |
149 | err:
150 | return ret;
151 | }
152 |
153 | CONFIGFS_ATTR(buse_, queue_depth);
154 |
155 | static ssize_t buse_can_write_same_show(struct config_item *item, char *page)
156 | {
157 | struct buse *buse = to_buse(item);
158 |
159 | return snprintf(page, PAGE_SIZE, "%d\n", buse->can_write_same);
160 | }
161 |
162 | static ssize_t buse_can_write_same_store(struct config_item *item, const char *page, size_t count)
163 | {
164 | struct buse *buse = to_buse(item);
165 | int ret;
166 | bool can_write_same;
167 |
168 | if (buse->power)
169 | return -EBUSY;
170 |
171 | ret = kstrtobool(page, &can_write_same);
172 | if (ret)
173 | goto err;
174 |
175 | buse->can_write_same = can_write_same;
176 |
177 | return count;
178 |
179 | err:
180 | return ret;
181 | }
182 |
183 | CONFIGFS_ATTR(buse_, can_write_same);
184 |
185 | static ssize_t buse_can_write_zeroes_show(struct config_item *item, char *page)
186 | {
187 | struct buse *buse = to_buse(item);
188 |
189 | return snprintf(page, PAGE_SIZE, "%d\n", buse->can_write_zeroes);
190 | }
191 |
192 | static ssize_t buse_can_write_zeroes_store(struct config_item *item, const char *page, size_t count)
193 | {
194 | struct buse *buse = to_buse(item);
195 | int ret;
196 | bool can_write_zeroes;
197 |
198 | if (buse->power)
199 | return -EBUSY;
200 |
201 | ret = kstrtobool(page, &can_write_zeroes);
202 | if (ret)
203 | goto err;
204 |
205 | buse->can_write_zeroes = can_write_zeroes;
206 |
207 | return count;
208 |
209 | err:
210 | return ret;
211 | }
212 |
213 | CONFIGFS_ATTR(buse_, can_write_zeroes);
214 |
215 | static ssize_t buse_can_discard_show(struct config_item *item, char *page)
216 | {
217 | struct buse *buse = to_buse(item);
218 |
219 | return snprintf(page, PAGE_SIZE, "%d\n", buse->can_discard);
220 | }
221 |
222 | static ssize_t buse_can_discard_store(struct config_item *item, const char *page, size_t count)
223 | {
224 | struct buse *buse = to_buse(item);
225 | int ret;
226 | bool can_discard;
227 |
228 | if (buse->power)
229 | return -EBUSY;
230 |
231 | ret = kstrtobool(page, &can_discard);
232 | if (ret)
233 | goto err;
234 |
235 | buse->can_discard = can_discard;
236 |
237 | return count;
238 |
239 | err:
240 | return ret;
241 | }
242 |
243 | CONFIGFS_ATTR(buse_, can_discard);
244 |
245 | static ssize_t buse_can_secure_erase_show(struct config_item *item, char *page)
246 | {
247 | struct buse *buse = to_buse(item);
248 |
249 | return snprintf(page, PAGE_SIZE, "%d\n", buse->can_secure_erase);
250 | }
251 |
252 | static ssize_t buse_can_secure_erase_store(struct config_item *item, const char *page, size_t count)
253 | {
254 | struct buse *buse = to_buse(item);
255 | int ret;
256 | bool can_secure_erase;
257 |
258 | if (buse->power)
259 | return -EBUSY;
260 |
261 | ret = kstrtobool(page, &can_secure_erase);
262 | if (ret)
263 | goto err;
264 |
265 | buse->can_secure_erase = can_secure_erase;
266 |
267 | return count;
268 |
269 | err:
270 | return ret;
271 | }
272 |
273 | CONFIGFS_ATTR(buse_, can_secure_erase);
274 |
275 | static ssize_t buse_no_scheduler_show(struct config_item *item, char *page)
276 | {
277 | struct buse *buse = to_buse(item);
278 |
279 | return snprintf(page, PAGE_SIZE, "%d\n", buse->no_scheduler);
280 | }
281 |
282 | static ssize_t buse_no_scheduler_store(struct config_item *item, const char *page, size_t count)
283 | {
284 | struct buse *buse = to_buse(item);
285 | int ret;
286 | bool no_scheduler;
287 |
288 | if (buse->power)
289 | return -EBUSY;
290 |
291 | ret = kstrtobool(page, &no_scheduler);
292 | if (ret)
293 | goto err;
294 |
295 | buse->no_scheduler = no_scheduler;
296 |
297 | return count;
298 |
299 | err:
300 | return ret;
301 | }
302 |
303 | CONFIGFS_ATTR(buse_, no_scheduler);
304 |
305 | static ssize_t buse_read_shm_size_show(struct config_item *item, char *page)
306 | {
307 | struct buse *buse = to_buse(item);
308 |
309 | return snprintf(page, PAGE_SIZE, "%llu\n", buse->read_shm_size);
310 | }
311 |
312 | static ssize_t buse_read_shm_size_store(struct config_item *item, const char *page, size_t count)
313 | {
314 | struct buse *buse = to_buse(item);
315 | int ret;
316 | u64 read_shm_size;
317 |
318 | if (buse->power)
319 | return -EBUSY;
320 |
321 | ret = kstrtou64(page, 0, &read_shm_size);
322 | if (ret)
323 | goto err;
324 |
325 | buse->read_shm_size = read_shm_size;
326 |
327 | return count;
328 |
329 | err:
330 | return ret;
331 | }
332 |
333 | CONFIGFS_ATTR(buse_, read_shm_size);
334 |
335 | static ssize_t buse_write_shm_size_show(struct config_item *item, char *page)
336 | {
337 | struct buse *buse = to_buse(item);
338 |
339 | return snprintf(page, PAGE_SIZE, "%llu\n", buse->write_shm_size);
340 | }
341 |
342 | static ssize_t buse_write_shm_size_store(struct config_item *item, const char *page, size_t count)
343 | {
344 | struct buse *buse = to_buse(item);
345 | int ret;
346 | u64 write_shm_size;
347 |
348 | if (buse->power)
349 | return -EBUSY;
350 |
351 | ret = kstrtou64(page, 0, &write_shm_size);
352 | if (ret)
353 | goto err;
354 |
355 | buse->write_shm_size = write_shm_size;;
356 |
357 | return count;
358 |
359 | err:
360 | return ret;
361 | }
362 |
363 | CONFIGFS_ATTR(buse_, write_shm_size);
364 |
365 | static ssize_t buse_write_chunk_size_show(struct config_item *item, char *page)
366 | {
367 | struct buse *buse = to_buse(item);
368 |
369 | return snprintf(page, PAGE_SIZE, "%llu\n", buse->write_chunk_size);
370 | }
371 |
372 | static ssize_t buse_write_chunk_size_store(struct config_item *item, const char *page, size_t count)
373 | {
374 | struct buse *buse = to_buse(item);
375 | int ret;
376 | u64 write_chunk_size;
377 |
378 | if (buse->power)
379 | return -EBUSY;
380 |
381 | ret = kstrtou64(page, 0, &write_chunk_size);
382 | if (ret)
383 | goto err;
384 |
385 | buse->write_chunk_size = write_chunk_size;
386 |
387 | return count;
388 |
389 | err:
390 | return ret;
391 | }
392 |
393 | CONFIGFS_ATTR(buse_, write_chunk_size);
394 |
395 | static ssize_t buse_blocksize_show(struct config_item *item, char *page)
396 | {
397 | struct buse *buse = to_buse(item);
398 |
399 | return snprintf(page, PAGE_SIZE, "%llu\n", buse->block_size);
400 | }
401 |
402 | static ssize_t buse_blocksize_store(struct config_item *item, const char *page, size_t count)
403 | {
404 | struct buse *buse = to_buse(item);
405 | int ret;
406 | u64 blocksize;
407 |
408 | if (buse->power)
409 | return -EBUSY;
410 |
411 | ret = kstrtou64(page, 0, &blocksize);
412 | if (ret)
413 | goto err;
414 |
415 | buse->block_size = blocksize;
416 |
417 | return count;
418 |
419 | err:
420 | return ret;
421 | }
422 |
423 | CONFIGFS_ATTR(buse_, blocksize);
424 |
425 | static ssize_t buse_io_min_show(struct config_item *item, char *page)
426 | {
427 | struct buse *buse = to_buse(item);
428 |
429 | return snprintf(page, PAGE_SIZE, "%llu\n", buse->io_min);
430 | }
431 |
432 | static ssize_t buse_io_min_store(struct config_item *item, const char *page, size_t count)
433 | {
434 | struct buse *buse = to_buse(item);
435 | int ret;
436 | u64 io_min;
437 |
438 | if (buse->power)
439 | return -EBUSY;
440 |
441 | ret = kstrtou64(page, 0, &io_min);
442 | if (ret)
443 | goto err;
444 |
445 | buse->io_min = io_min;
446 |
447 | return count;
448 |
449 | err:
450 | return ret;
451 | }
452 |
453 | CONFIGFS_ATTR(buse_, io_min);
454 |
455 | static ssize_t buse_io_opt_show(struct config_item *item, char *page)
456 | {
457 | struct buse *buse = to_buse(item);
458 |
459 | return snprintf(page, PAGE_SIZE, "%llu\n", buse->io_opt);
460 | }
461 |
462 | static ssize_t buse_io_opt_store(struct config_item *item, const char *page, size_t count)
463 | {
464 | struct buse *buse = to_buse(item);
465 | int ret;
466 | u64 io_opt;
467 |
468 | if (buse->power)
469 | return -EBUSY;
470 |
471 | ret = kstrtou64(page, 0, &io_opt);
472 | if (ret)
473 | goto err;
474 |
475 | buse->io_opt = io_opt;
476 |
477 | return count;
478 |
479 | err:
480 | return ret;
481 | }
482 |
483 | CONFIGFS_ATTR(buse_, io_opt);
484 |
485 | static ssize_t buse_size_show(struct config_item *item, char *page)
486 | {
487 | struct buse *buse = to_buse(item);
488 |
489 | return snprintf(page, PAGE_SIZE, "%llu\n", buse->size);
490 | }
491 |
492 | static ssize_t buse_size_store(struct config_item *item, const char *page, size_t count)
493 | {
494 | struct buse *buse = to_buse(item);
495 | int ret;
496 | u64 size;
497 |
498 | if (buse->power)
499 | return -EBUSY;
500 |
501 | ret = kstrtou64(page, 0, &size);
502 | if (ret)
503 | goto err;
504 |
505 | buse->size = size;
506 |
507 | return count;
508 |
509 | err:
510 | return ret;
511 | }
512 |
513 | CONFIGFS_ATTR(buse_, size);
514 |
515 | static ssize_t buse_collision_area_size_show(struct config_item *item, char *page)
516 | {
517 | struct buse *buse = to_buse(item);
518 |
519 | return snprintf(page, PAGE_SIZE, "%llu\n", buse->collision_area_size);
520 | }
521 |
522 | static ssize_t buse_collision_area_size_store(struct config_item *item, const char *page, size_t count)
523 | {
524 | struct buse *buse = to_buse(item);
525 | int ret;
526 | u64 collision_area_size;
527 |
528 | if (buse->power)
529 | return -EBUSY;
530 |
531 | ret = kstrtou64(page, 0, &collision_area_size);
532 | if (ret)
533 | goto err;
534 |
535 | if (collision_area_size % buse->block_size != 0 ||
536 | collision_area_size > buse->size)
537 | collision_area_size = buse->block_size;
538 |
539 | buse->collision_area_size = collision_area_size;
540 |
541 | return count;
542 |
543 | err:
544 | return ret;
545 | }
546 |
547 | CONFIGFS_ATTR(buse_, collision_area_size);
548 |
549 | static struct configfs_attribute *buse_attrs[] = {
550 | &buse_attr_collision_area_size,
551 | &buse_attr_size,
552 | &buse_attr_blocksize,
553 | &buse_attr_io_min,
554 | &buse_attr_io_opt,
555 | &buse_attr_write_chunk_size,
556 | &buse_attr_write_shm_size,
557 | &buse_attr_read_shm_size,
558 | &buse_attr_hw_queues,
559 | &buse_attr_queue_depth,
560 | &buse_attr_no_scheduler,
561 | &buse_attr_can_secure_erase,
562 | &buse_attr_can_write_same,
563 | &buse_attr_can_write_zeroes,
564 | &buse_attr_can_discard,
565 | &buse_attr_power,
566 | NULL,
567 | };
568 |
569 | static void buse_release(struct config_item *item)
570 | {
571 | struct buse *buse = to_buse(item);
572 |
573 | if (buse->power)
574 | return;
575 | }
576 |
577 | static struct configfs_item_operations buse_ops = {
578 | .release = buse_release,
579 | };
580 |
581 | static const struct config_item_type buse_type = {
582 | .ct_item_ops = &buse_ops,
583 | .ct_attrs = buse_attrs,
584 | .ct_owner = THIS_MODULE,
585 | };
586 |
587 | static struct config_item *buse_group_make_item(struct config_group *group, const char *name)
588 | {
589 | struct buse *buse;
590 | uint index;
591 | int ret;
592 |
593 | ret = kstrtouint(name, 0, &index);
594 | if (ret < 0)
595 | return ERR_PTR(ret);
596 |
597 | buse = buse_add(index);
598 | if (IS_ERR(buse))
599 | return ERR_PTR(-ENOMEM);
600 |
601 | config_item_init_type_name(&buse->item, name, &buse_type);
602 |
603 | return &buse->item;
604 | }
605 |
606 | static void buse_group_drop_item(struct config_group *group, struct config_item *item)
607 | {
608 | struct buse *buse = to_buse(item);
609 |
610 | mutex_lock(&buse->configfs_mutex);
611 |
612 | if (buse->power)
613 | goto err;
614 |
615 | buse_off(buse);
616 | buse_del(buse);
617 | config_item_put(item);
618 |
619 | err:
620 | mutex_unlock(&buse->configfs_mutex);
621 | }
622 |
623 | static struct configfs_group_operations buse_group_ops = {
624 | .make_item = buse_group_make_item,
625 | .drop_item = buse_group_drop_item,
626 | };
627 |
628 | static const struct config_item_type buse_group_type = {
629 | .ct_group_ops = &buse_group_ops,
630 | .ct_owner = THIS_MODULE,
631 | };
632 |
633 | static struct configfs_subsystem buse_subsys = {
634 | .su_group = {
635 | .cg_item = {
636 | .ci_namebuf = "buse",
637 | .ci_type = &buse_group_type,
638 | },
639 | },
640 | };
641 |
642 | /*
643 | * Initialize configfs subsystem. Later on it is used for all the operation with the kernel module.
644 | */
645 | int buse_configfs_init(void)
646 | {
647 | int ret;
648 |
649 | config_group_init(&buse_subsys.su_group);
650 | mutex_init(&buse_subsys.su_mutex);
651 | ret = configfs_register_subsystem(&buse_subsys);
652 | if (ret)
653 | goto err;
654 |
655 | return 0;
656 |
657 | err:
658 | return ret;
659 | }
660 |
661 | /*
662 | * Deinit of configfs.
663 | */
664 | void buse_configfs_exit(void)
665 | {
666 | configfs_unregister_subsystem(&buse_subsys);
667 | }
668 |
--------------------------------------------------------------------------------
/kernel/buse-configfs.h:
--------------------------------------------------------------------------------
1 | /* Copyright (C) 2021 Vojtech Aschenbrenner */
2 |
3 | #ifndef BUSE_CONFIGFS_H
4 | #define BUSE_CONFIGFS_H
5 |
6 | /*
7 | * Initialize configfs subsystem. Later on it is used for all the operation with the kernel module.
8 | */
9 | int buse_configfs_init(void);
10 |
11 | /*
12 | * Deinit of configfs.
13 | */
14 | void buse_configfs_exit(void);
15 |
16 | #endif
17 |
--------------------------------------------------------------------------------
/kernel/buse-rqueue.c:
--------------------------------------------------------------------------------
1 | /* Copyright (C) 2021 Vojtech Aschenbrenner */
2 |
3 | #include
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include
11 | #include
12 | #include
13 |
14 | #include "buse-blkdev.h"
15 | #include "buse-chrdev.h"
16 | #include "buse-rqueue.h"
17 | #include "buse-wqueue.h"
18 | #include "main.h"
19 |
20 | /*
21 | * Copy data from the shared memory to the memory specified by the io request.
22 | */
23 | static void copy_to_request(struct request *rq, char *src)
24 | {
25 | char *dst;
26 | size_t len;
27 | struct bio_vec bvec;
28 | struct req_iterator iter;
29 |
30 | rq_for_each_segment(bvec, rq, iter) {
31 | len = bvec.bv_len;
32 | dst = kmap_atomic(bvec.bv_page);
33 | memcpy(dst + bvec.bv_offset, src, len);
34 | kunmap_atomic(dst);
35 | src += len;
36 | }
37 | }
38 |
39 | /*
40 | * Acknowledge from the userspace that the read is done. If draining is true, it means that we are
41 | * shutting down and we are no longer servig to the userspace daemon.
42 | *
43 | * Data are copied from the shared memory (filled by user space) to the io request destination. Then
44 | * the bitmap tracking the free space in shared memory is updated and read requests is finished.
45 | */
46 | void ack_read_request(struct buse_rqueue *rq, u64 shmem_offset, bool draining)
47 | {
48 | struct buse *buse = rq->buse;
49 | struct read_chunk *ch;
50 | int shmem_offset_block = shmem_offset / buse->block_size;
51 | int shmem_offset_blocks_cnt = buse->read_shm_size / buse->block_size;
52 |
53 | if (shmem_offset % buse->block_size ||
54 | shmem_offset_block >= shmem_offset_blocks_cnt) {
55 | BUG();
56 |
57 | }
58 |
59 | mutex_lock(&rq->lock);
60 |
61 | ch = rq->chunk_from_bitmap[shmem_offset_block];
62 | rq->chunk_from_bitmap[shmem_offset_block] = NULL;
63 |
64 | /* TODO: Use blk_mq_complete_request_remote() to finish on local numa node */
65 |
66 | copy_to_request(ch->cmd->rq, rq->shmem + shmem_offset);
67 |
68 | bitmap_release_region(rq->free_chunks_bitmap, shmem_offset_block,
69 | order_base_2(blk_rq_bytes(ch->cmd->rq) / buse->block_size));
70 |
71 | list_del_init(&ch->list);
72 |
73 | if (draining)
74 | blk_mq_end_request(ch->cmd->rq, BLK_STS_IOERR);
75 | else
76 | blk_mq_end_request(ch->cmd->rq, BLK_STS_OK);
77 |
78 | kfree(ch);
79 |
80 | wake_up(&rq->free_chunks_avail);
81 | mutex_unlock(&rq->lock);
82 | }
83 |
84 | /*
85 | * If the read chunk is actually a termination chunk leading to device shutdown.
86 | */
87 | bool is_rqueue_term(struct read_chunk *ch)
88 | {
89 | return ch->shmem_offset == -1;
90 | }
91 |
92 | /*
93 | * Pulls read chunk from the busy queue and returns it. If there is no read chunk in the busy queue,
94 | * we sleep. If the chunks is not a termination chunk, we add to the fetched list meaning that the
95 | * chunk is in userspace but not yet acknowledged. It is for the case of userspace failure and
96 | * potential rerun fetched but not yet acknowledged chunks.
97 | */
98 | struct read_chunk *pop_read_request_wait(struct buse_rqueue *rq)
99 | {
100 | int ret;
101 | struct read_chunk *ch = NULL;
102 |
103 | ret = wait_event_interruptible(rq->busy_chunks_avail, !list_empty(&rq->busy_chunks));
104 | if (ret < 0)
105 | return ERR_PTR(-EAGAIN);
106 |
107 | mutex_lock(&rq->lock);
108 |
109 | BUG_ON(list_empty(&rq->busy_chunks));
110 |
111 | ch = list_first_entry(&rq->busy_chunks, struct read_chunk, list);
112 | list_del_init(&ch->list);
113 |
114 | if (!is_rqueue_term(ch))
115 | list_add_tail(&ch->list, &rq->fetched_chunks);
116 | mutex_unlock(&rq->lock);
117 |
118 | return ch;
119 | }
120 |
121 | /*
122 | * Allocates space in shared memory for a new read chunk corresponding to the cmd.
123 | */
124 | static struct read_chunk *create_read_chunk(struct buse_cmd *cmd)
125 | {
126 | struct buse_queue *q = cmd->queue;
127 | struct buse_rqueue *rq = &q->r;
128 | struct buse *buse = q->r.buse;
129 | struct request *r = cmd->rq;
130 | size_t len = blk_rq_sectors(r);
131 | size_t sector = blk_rq_pos(r);
132 | int chunk_index;
133 | int ret;
134 | struct read_chunk *ch;
135 |
136 | size_t shmem_blocks = buse->read_shm_size / buse->block_size;
137 |
138 | chunk_index = bitmap_find_free_region(rq->free_chunks_bitmap, shmem_blocks, order_base_2(len * SECTOR_SIZE / buse->block_size));
139 | if (chunk_index < 0) {
140 | ret = -EFAULT;
141 | goto err;
142 | }
143 |
144 | ch = kmalloc(sizeof(*ch), GFP_KERNEL);
145 | if (!ch) {
146 | ret = -ENOMEM;
147 | goto err_bitmap;
148 | }
149 |
150 | ch->len = len;
151 | ch->sector = sector;
152 | ch->cmd = cmd;
153 | ch->shmem_offset = chunk_index * buse->block_size;
154 | rq->chunk_from_bitmap[chunk_index] = ch;
155 |
156 | return ch;
157 |
158 | err_bitmap:
159 | bitmap_release_region(rq->free_chunks_bitmap, chunk_index, order_base_2(len * SECTOR_SIZE / buse->block_size));
160 | err:
161 | return ERR_PTR(ret);
162 | }
163 |
164 | /*
165 | * Creates a read chunk and puts it to the busy queue. The chunks is fetched from the busy queue by
166 | * the user space. The busy queue is woken up, in case it slept.
167 | */
168 | blk_status_t buse_read_plain(struct buse_cmd *cmd)
169 | {
170 | struct buse_queue *q = cmd->queue;
171 | struct buse_rqueue *rq = &q->r;
172 | struct read_chunk *ch;
173 | struct buse *buse = rq->buse;
174 | size_t len = (u64)blk_rq_bytes(cmd->rq) / buse->block_size;
175 |
176 | again:
177 | if (cmd->canceled) {
178 | blk_mq_end_request(cmd->rq, BLK_STS_IOERR);
179 | return BLK_STS_IOERR;
180 | }
181 |
182 | mutex_lock(&rq->lock);
183 |
184 | if (rq->terminated) {
185 | blk_mq_end_request(cmd->rq, BLK_STS_IOERR);
186 | mutex_unlock(&rq->lock);
187 | return BLK_STS_IOERR;
188 | }
189 |
190 | ch = create_read_chunk(cmd);
191 | if (IS_ERR(ch)) {
192 | size_t shmem_blocks = buse->read_shm_size / buse->block_size;
193 | mutex_unlock(&rq->lock);
194 | wait_event_interruptible(rq->free_chunks_avail,
195 | bitmap_find_next_zero_area(rq->free_chunks_bitmap, shmem_blocks, 0, len, 0)
196 | < shmem_blocks - len);
197 | goto again;
198 | }
199 |
200 | list_add_tail(&ch->list, &rq->busy_chunks);
201 | blk_mq_start_request(cmd->rq);
202 | wake_up(&rq->busy_chunks_avail);
203 | mutex_unlock(&rq->lock);
204 |
205 | return BLK_STS_OK;
206 | }
207 |
208 | /*
209 | * Sends termination chunk to the rq.
210 | */
211 | void rqueue_send_term(struct buse_rqueue *rq)
212 | {
213 | struct read_chunk *ch;
214 | size_t shmem_blocks = rq->buse->read_shm_size / rq->buse->block_size;
215 | again:
216 | mutex_lock(&rq->lock);
217 |
218 | if (!bitmap_empty(rq->free_chunks_bitmap, shmem_blocks)) {
219 | mutex_unlock(&rq->lock);
220 | wait_event_interruptible(rq->free_chunks_avail, bitmap_empty(rq->free_chunks_bitmap, shmem_blocks));
221 | goto again;
222 | }
223 |
224 |
225 | if (wq_has_sleeper(&rq->free_chunks_avail)) {
226 | wake_up(&rq->free_chunks_avail);
227 | mutex_unlock(&rq->lock);
228 | goto again;
229 | }
230 |
231 | ch = kzalloc(sizeof(*ch), GFP_KERNEL);
232 | if (!ch) {
233 | pr_alert("Cannot allocate termination packet! Check traffic and shut down manually!\n");
234 | return;
235 | }
236 |
237 | ch->shmem_offset = (u64)-1;
238 | list_add_tail(&ch->list, &rq->busy_chunks);
239 | rq->terminated = true;
240 | wake_up(&rq->busy_chunks_avail);
241 |
242 | mutex_unlock(&rq->lock);
243 | }
244 |
245 | static bool overlaps(size_t x, size_t x_len, size_t y, size_t y_len)
246 | {
247 | return ((x <= y && x + x_len > y) || (x >= y && x < y + y_len));
248 | }
249 |
250 | /*
251 | * Adds a dependent read to the write chunk. When that write chunk is acknowledged, all dependent
252 | * reads are allowed to be send to userspace.
253 | */
254 | static int read_dep_add(struct buse_cmd *cmd, struct write_chunk *ch)
255 | {
256 | struct rq_node *node = kzalloc(sizeof(*node), GFP_KERNEL);
257 | if (!node)
258 | return -ENOMEM;
259 |
260 | node->rq = cmd->rq;
261 | atomic_inc(&cmd->read.write_deps);
262 | list_add_tail(&node->list, &ch->dependent_reads);
263 |
264 | return 0;
265 | }
266 |
267 | /*
268 | * Checks if the command has conflict with any of writes in the write chunk. Conflict means that
269 | * read reads data written be the write. This is read after write hazard.
270 | */
271 | static bool is_read_dep_conflict(struct buse_cmd *cmd, struct write_chunk *ch)
272 | {
273 | size_t sector = blk_rq_pos(cmd->rq);
274 | size_t len = blk_rq_sectors(cmd->rq);
275 | int i;
276 | struct writelist_item *w;
277 |
278 | if (!ch || is_flush_packet(ch))
279 | return false;
280 |
281 | w = ch->writelist_frontier - ch->num_writes;
282 | for (i = 0; i < ch->num_writes; i++, w++)
283 | if (overlaps(sector, len, w->sector, w->len))
284 | return true;
285 |
286 | return false;
287 | }
288 |
289 | /*
290 | * First checks for read after write hazards and add potential conflicting reads to the appropriate
291 | * write chunks. If no conflict was found and there is no more queues to check the read processed.
292 | * Otherwise the read is processed as a callback when depending write chunk is acknowledged.
293 | */
294 | static int rqueue_read_checked(void *data)
295 | {
296 | int ret;
297 | struct cmd_q_args *args = data;
298 | struct buse_cmd *cmd = args->cmd;
299 | struct buse_queue *q = args->q;
300 | struct buse_wqueue *wq = &q->w;
301 | struct write_chunk *ch;
302 |
303 | mutex_lock(&wq->lock);
304 | list_for_each_entry(ch, &wq->busy_chunks, list)
305 | if (is_read_dep_conflict(cmd, ch)) {
306 | ret = read_dep_add(cmd, ch);
307 | if (ret) {
308 | pr_alert("Cannot add read dep from busy_chunks\n");
309 | goto err;
310 | }
311 | }
312 |
313 | list_for_each_entry(ch, &wq->fetched_chunks, list)
314 | if (is_read_dep_conflict(cmd, ch)) {
315 | ret = read_dep_add(cmd, ch);
316 | if (ret) {
317 | pr_alert("Cannot add read dep from fetched_chunks\n");
318 | goto err;
319 | }
320 | }
321 |
322 | if (is_read_dep_conflict(cmd, wq->active_chunk)) {
323 | ret = read_dep_add(cmd, wq->active_chunk);
324 | if (ret) {
325 | pr_alert("Cannot add read dep from active_chunk\n");
326 | goto err;
327 | }
328 | close_chunk(wq);
329 | }
330 |
331 | goto ret;
332 |
333 | err:
334 | cmd->canceled = true;
335 | ret:
336 | mutex_unlock(&wq->lock);
337 | if (atomic_dec_and_test(&args->cmd->read.queues_pending) &&
338 | atomic_read(&args->cmd->read.write_deps) == 0 &&
339 | atomic_cmpxchg(&args->cmd->read.queues_pending, 0, 1) == 0) {
340 | buse_read_plain(args->cmd);
341 | }
342 |
343 | kfree(data);
344 |
345 | /* Here it depends on whether sequential or threaded version is used. */
346 | return 0; /* For sequential version */
347 | /* do_exit(0); */ /* For threaded version */
348 | }
349 |
350 | /*
351 | * Spawns checked reads on all queues.
352 | */
353 | blk_status_t buse_read(struct buse_cmd *cmd)
354 | {
355 | int i;
356 | struct cmd_q_args *args;
357 | struct buse_queue *q = cmd->queue;
358 | struct buse *buse = q->r.buse;
359 | size_t num_queues = buse->num_queues;
360 |
361 | atomic_set(&cmd->read.write_deps, 0);
362 | atomic_set(&cmd->read.queues_pending, num_queues);
363 |
364 | for (i = 0; i < num_queues; i++) {
365 | args = kzalloc(sizeof(*args), GFP_KERNEL);
366 | if (!args)
367 | goto err;
368 |
369 | args->cmd = cmd;
370 | args->q = &buse->queues[i];
371 |
372 |
373 | /*
374 | * Asynchronous version
375 | * if (kthread_run(rqueue_read_checked, args, "buse-queue_read_checked_th%d", i) < 0) {
376 | * pr_alert("Cannot spawn rqueue_read_checked thread!\n");
377 | * goto err_args;
378 | * }
379 | */
380 |
381 | rqueue_read_checked(args); /* Sequential version */
382 | }
383 |
384 | return BLK_STS_OK;
385 |
386 | /* err_args: */
387 | kfree(args);
388 | err:
389 | atomic_sub(num_queues - i, &cmd->read.queues_pending);
390 | cmd->canceled = true;
391 |
392 | if (!i)
393 | return BLK_STS_AGAIN;
394 |
395 | return BLK_STS_OK;
396 | }
397 |
398 | /*
399 | * Drains all the queues because the is shutting down non-gracefully and we don't want memory leaks.
400 | */
401 | static void rqueue_drain(struct buse_rqueue *rq)
402 | {
403 | struct read_chunk *chunk;
404 | uint r_chunks = rq->buse->read_shm_size / rq->buse->block_size;
405 | int i;
406 |
407 | for (i = 0; i < r_chunks; i++) {
408 | size_t offset = i * rq->buse->block_size;
409 | if (rq->chunk_from_bitmap[i])
410 | ack_read_request(rq, offset, true);
411 | }
412 |
413 | while (!list_empty(&rq->busy_chunks)) {
414 | chunk = list_first_entry(&rq->busy_chunks, struct read_chunk, list);
415 | mutex_unlock(&rq->lock);
416 | if (is_rqueue_term(chunk)) {
417 | mutex_lock(&rq->lock);
418 | list_del_init(&chunk->list);
419 | mutex_unlock(&rq->lock);
420 | kfree(chunk);
421 | } else
422 | ack_read_request(rq, chunk->shmem_offset, true);
423 | mutex_lock(&rq->lock);
424 | }
425 | }
426 |
427 | /*
428 | * Deallocates the read queue.
429 | */
430 | static void rqueue_exit(struct buse_rqueue *rq)
431 | {
432 | rqueue_drain(rq);
433 | kfree(rq->chunk_from_bitmap);
434 | bitmap_free(rq->free_chunks_bitmap);
435 | vfree(rq->shmem);
436 | }
437 |
438 | /*
439 | * Allocates the read queue.
440 | */
441 | static int rqueue_init(struct buse_rqueue *rq)
442 | {
443 | int ret;
444 | struct buse *buse = rq->buse;
445 | uint r_chunks = buse->read_shm_size / buse->block_size;
446 | int numa_node = buse_get_numa_node_for_queue_id(rq->buse, rq->q->id);
447 |
448 | init_waitqueue_head(&rq->busy_chunks_avail);
449 | init_waitqueue_head(&rq->free_chunks_avail);
450 | INIT_LIST_HEAD(&rq->busy_chunks);
451 | INIT_LIST_HEAD(&rq->fetched_chunks);
452 | mutex_init(&rq->lock);
453 |
454 | rq->size = buse->read_shm_size;
455 |
456 | rq->shmem = vmalloc_node(rq->size, numa_node);
457 | if (!rq->shmem) {
458 | ret = -ENOMEM;
459 | goto err;
460 | }
461 |
462 | rq->free_chunks_bitmap = kmalloc_array_node(BITS_TO_LONGS(r_chunks), sizeof(unsigned long), GFP_KERNEL | __GFP_ZERO, numa_node);
463 | if (!rq->free_chunks_bitmap) {
464 | ret = -ENOMEM;
465 | goto err_shmem;
466 | }
467 |
468 | rq->chunk_from_bitmap = kcalloc_node(r_chunks, sizeof(struct read_chunk *), GFP_KERNEL, numa_node);
469 | if (!rq->chunk_from_bitmap) {
470 | ret = -ENOMEM;
471 | goto err_bitmap;
472 | }
473 |
474 | return 0;
475 |
476 | err_bitmap:
477 | bitmap_free(rq->free_chunks_bitmap);
478 | err_shmem:
479 | vfree(rq->shmem);
480 | err:
481 | return ret;
482 | }
483 |
484 | /*
485 | * Init all read queues.
486 | */
487 | int buse_rqueues_init(struct buse *buse)
488 | {
489 | int ret, i;
490 | struct buse_queue *q;
491 |
492 | for (i = 0, q = buse->queues; i < buse->num_queues; i++, q++) {
493 | q->r.buse = buse;
494 | q->r.q = q;
495 | ret = rqueue_init(&q->r);
496 | if (ret) {
497 | i++;
498 | q++;
499 | goto err;
500 | }
501 | }
502 |
503 | return 0;
504 |
505 | err:
506 | for (i--, q--; i > 0; i--, q--)
507 | rqueue_exit(&q->r);
508 |
509 | return ret;
510 | }
511 |
512 | /*
513 | * Deinit all read queues.
514 | */
515 | int buse_rqueues_exit(struct buse *buse)
516 | {
517 | int i;
518 | struct buse_queue *q;
519 |
520 | for (i = 0, q = buse->queues; i < buse->num_queues; i++, q++)
521 | rqueue_exit(&q->r);
522 |
523 | return 0;
524 | }
525 |
526 | /*
527 | * Rerun all fetched chunks by the user space again. This is called when user space failes without
528 | * acknowledging read chunks and reconnects again.
529 | */
530 | static void rerun_read_chunks(struct buse_rqueue *rq)
531 | {
532 | struct read_chunk *ch;
533 |
534 | mutex_lock(&rq->lock);
535 | while (!list_empty(&rq->fetched_chunks)) {
536 | ch = list_last_entry(&rq->fetched_chunks, struct read_chunk, list);
537 | list_del_init(&ch->list);
538 | list_add(&ch->list, &rq->busy_chunks);
539 | }
540 | wake_up(&rq->busy_chunks_avail);
541 | mutex_unlock(&rq->lock);
542 | }
543 |
544 | /*
545 | * Set the queue to be bound.
546 | */
547 | void buse_rqueue_bind(struct buse_rqueue *rq)
548 | {
549 | mutex_lock(&rq->buse->configfs_mutex);
550 | atomic_set(&rq->bound, 1);
551 | mutex_unlock(&rq->buse->configfs_mutex);
552 | buse_blkdev_init_cond(rq->buse);
553 | rerun_read_chunks(rq);
554 | }
555 |
556 | /*
557 | * Returns true if all read queues are bound. I.e. have connected the userspace counterpart.
558 | */
559 | bool buse_rqueues_bound(struct buse *buse)
560 | {
561 | int i;
562 | struct buse_rqueue *rq;
563 |
564 | for (i = 0; i < buse->num_queues; i++) {
565 | rq = &buse->queues[i].r;
566 | if (atomic_read(&rq->bound) == 0)
567 | return false;
568 | }
569 |
570 | return true;
571 | }
572 |
--------------------------------------------------------------------------------
/kernel/buse-rqueue.h:
--------------------------------------------------------------------------------
1 | /* Copyright (C) 2021 Vojtech Aschenbrenner */
2 |
3 | #ifndef BUSE_RQUEUE_H
4 | #define BUSE_RQUEUE_H
5 |
6 | #include
7 | #include "main.h"
8 |
9 | /*
10 | * Spawns checked reads on all queues.
11 | */
12 | blk_status_t buse_read(struct buse_cmd *cmd);
13 |
14 | /*
15 | * Creates a read chunk and puts it to the busy queue. The chunks is fetched from the busy queue by
16 | * the user space. The busy queue is woken up, in case it slept.
17 | */
18 | blk_status_t buse_read_plain(struct buse_cmd *cmd);
19 |
20 | /*
21 | * Init all read queues.
22 | */
23 | int buse_rqueues_init(struct buse *buse);
24 |
25 | /*
26 | * Deinit all read queues.
27 | */
28 | int buse_rqueues_exit(struct buse *buse);
29 |
30 | /*
31 | * Pulls read chunk from the busy queue and returns it. If there is no read chunk in the busy queue,
32 | * we sleep. If the chunks is not a termination chunk, we add to the fetched list meaning that the
33 | * chunk is in userspace but not yet acknowledged. It is for the case of userspace failure and
34 | * potential rerun fetched but not yet acknowledged chunks.
35 | */
36 | struct read_chunk *pop_read_request_wait(struct buse_rqueue *rq);
37 |
38 | /*
39 | * Acknowledge from the userspace that the read is done. If draining is true, it means that we are
40 | * shutting down and we are no longer servig to the userspace daemon.
41 | *
42 | * Data are copied from the shared memory (filled by user space) to the io request destination. Then
43 | * the bitmap tracking the free space in shared memory is updated and read requests is finished.
44 | */
45 | void ack_read_request(struct buse_rqueue *rqueue, u64 shmem_offset, bool draining);
46 |
47 | /*
48 | * Returns true if all read queues are bound. I.e. have connected the userspace counterpart.
49 | */
50 | bool buse_rqueues_bound(struct buse *buse);
51 |
52 | /*
53 | * Set the queue to be bound.
54 | */
55 | void buse_rqueue_bind(struct buse_rqueue *rq);
56 |
57 | /*
58 | * Sends termination chunk to the rq.
59 | */
60 | void rqueue_send_term(struct buse_rqueue *rq);
61 |
62 |
63 | /*
64 | * If the read chunk is actually a termination chunk leading to device shutdown.
65 | */
66 | bool is_rqueue_term(struct read_chunk *ch);
67 |
68 | #endif
69 |
--------------------------------------------------------------------------------
/kernel/buse-wqueue.c:
--------------------------------------------------------------------------------
1 | /* Copyright (C) 2021 Vojtech Aschenbrenner */
2 |
3 | #include
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include
11 |
12 | #include "buse-blkdev.h"
13 | #include "buse-chrdev.h"
14 | #include "buse-rqueue.h"
15 | #include "buse-wqueue.h"
16 | #include "main.h"
17 |
18 | static bool valid_buse_cmd(struct buse_cmd *cmd)
19 | {
20 | return cmd->magic == BUSE_MAGIC;
21 | }
22 |
23 | /*
24 | * Finalizer for flush chunk when it is acknowledged from the user space.
25 | */
26 | static void flush_finalize(struct write_chunk *ch, struct buse_wqueue *wq, bool draining)
27 | {
28 | struct buse_cmd *cmd = ch->cmd;
29 |
30 | if (!valid_buse_cmd(cmd)) {
31 | pr_debug("Invalid flush cmd!\n");
32 | return;
33 | }
34 |
35 | mutex_lock(&wq->lock);
36 | list_del_init(&ch->list);
37 | mutex_unlock(&wq->lock);
38 |
39 | kfree(ch);
40 |
41 | if (atomic_dec_and_test(&cmd->flush.queues_pending)) {
42 | if (draining)
43 | blk_mq_end_request(cmd->rq,
44 | cmd->canceled ? BLK_STS_IOERR : BLK_STS_OK);
45 | else
46 | blk_mq_end_request(cmd->rq,
47 | cmd->canceled ? BLK_STS_AGAIN : BLK_STS_OK);
48 | }
49 | }
50 |
51 | static size_t chunk_index(struct buse_wqueue *wq, struct write_chunk *wc)
52 | {
53 | void *chunks_ = wq->chunks;
54 | void *wc_ = wc;
55 |
56 | return (wc_ - chunks_) / sizeof(*wc);
57 | }
58 |
59 | /*
60 | * Initialize write chunk structure.
61 | */
62 | static void init_write_chunk(struct buse_wqueue *wq, struct write_chunk *ch)
63 | {
64 | size_t max_writes = wq->buse->write_chunk_size / wq->buse->block_size;
65 | u64 i = chunk_index(wq, ch);
66 |
67 | ch->shmem_offset = i * wq->buse->write_chunk_size;
68 | ch->writelist_frontier = wq->shmem + ch->shmem_offset;
69 | ch->data_frontier = ch->writelist_frontier + max_writes;
70 | ch->num_writes = 0;
71 |
72 | INIT_LIST_HEAD(&ch->dependent_reads);
73 |
74 | mutex_lock(&wq->lock);
75 | list_add_tail(&ch->list, &wq->free_chunks);
76 | mutex_unlock(&wq->lock);
77 | }
78 |
79 | /*
80 | * Finalizer for write chunk. It initiates read on all dependent reads from the read after write
81 | * hazard check. Then it just recycle the write chunk for future usage.
82 | */
83 | static int write_finalize(struct write_chunk *ch, struct buse_wqueue *wq)
84 | {
85 | struct rq_node *rq;
86 | struct buse_cmd *cmd;
87 |
88 | mutex_lock(&wq->lock);
89 |
90 | /* Remove from fetched list */
91 | list_del_init(&ch->list);
92 |
93 | mutex_unlock(&wq->lock);
94 |
95 | while (!list_empty(&ch->dependent_reads)) {
96 | rq = list_first_entry(&ch->dependent_reads, struct rq_node, list);
97 | cmd = blk_mq_rq_to_pdu(rq->rq);
98 | if (atomic_dec_and_test(&cmd->read.write_deps) &&
99 | atomic_read(&cmd->read.queues_pending) == 0 &&
100 | atomic_cmpxchg(&cmd->read.queues_pending, 0, 1) == 0) {
101 |
102 | buse_read_plain(cmd);
103 | }
104 |
105 | list_del_init(&rq->list);
106 | kfree(rq);
107 | }
108 |
109 | init_write_chunk(wq, ch);
110 | wake_up(&wq->free_chunks_avail);
111 |
112 | return 0;
113 | }
114 |
115 | static bool is_flush_offset(u64 offset)
116 | {
117 | return offset > (1UL << 32);
118 | }
119 |
120 | bool is_flush_packet(struct write_chunk *wc)
121 | {
122 | return is_flush_offset(wc->shmem_offset);
123 | }
124 |
125 | /*
126 | * When userspace acknowledge the write chunk we perform appropriate actions based on the write
127 | * chunk type.
128 | */
129 | void ack_write_request(struct buse_wqueue *wq, u64 chunk_offset, bool draining)
130 | {
131 | if (is_flush_offset(chunk_offset))
132 | flush_finalize((struct write_chunk *)chunk_offset, wq, draining);
133 | else {
134 | struct write_chunk *ch;
135 | u64 chunk_index = chunk_offset / wq->buse->write_chunk_size;
136 | uint chunks_total = wq->buse->write_shm_size / wq->buse->write_chunk_size;
137 |
138 | if (chunk_offset % wq->buse->write_chunk_size ||
139 | chunk_index >= chunks_total) {
140 | BUG();
141 | }
142 |
143 | ch = &wq->chunks[chunk_index];
144 | write_finalize(ch, wq);
145 | }
146 | }
147 |
148 | /*
149 | * Pulls write chunk from the busy queue and returns it. If there is no write chunk in the busy queue,
150 | * we sleep. If the chunks is not a termination chunk, we add to the fetched list meaning that the
151 | * chunk is in userspace but not yet acknowledged. It is for the case of userspace failure and
152 | * potential rerun fetched but not yet acknowledged chunks.
153 | */
154 | struct write_chunk *pop_write_request_wait(struct buse_wqueue *wq)
155 | {
156 | struct write_chunk *ch = NULL;
157 | int ret;
158 |
159 | ret = wait_event_interruptible(wq->busy_chunks_avail, !list_empty(&wq->busy_chunks));
160 | if (ret < 0)
161 | return ERR_PTR(-EAGAIN);
162 |
163 | mutex_lock(&wq->lock);
164 |
165 | BUG_ON(list_empty(&wq->busy_chunks));
166 |
167 | ch = list_first_entry(&wq->busy_chunks, struct write_chunk, list);
168 | list_del_init(&ch->list);
169 |
170 | if (!is_wqueue_term(ch))
171 | list_add_tail(&ch->list, &wq->fetched_chunks);
172 |
173 | mutex_unlock(&wq->lock);
174 |
175 | return ch;
176 | }
177 |
178 | /*
179 | * Closes active chunk of the queue, i.e. no more writes can be written to the chunk and a new
180 | * chunks has to be opened. This usually means that flush happened or the chunk is full.
181 | */
182 | int close_chunk(struct buse_wqueue *wq)
183 | {
184 | struct write_chunk *ch = wq->active_chunk;
185 |
186 | if (!ch || !ch->num_writes)
187 | goto end;
188 |
189 | list_add_tail(&ch->list, &wq->busy_chunks);
190 | wq->active_chunk = NULL;
191 |
192 | wake_up(&wq->busy_chunks_avail);
193 |
194 | end:
195 | return 0;
196 | }
197 |
198 | /*
199 | * Opens new active chunk if there is any free chunk.
200 | */
201 | int open_chunk(struct buse_wqueue *wq)
202 | {
203 | BUG_ON(wq->active_chunk);
204 |
205 | if (list_empty(&wq->free_chunks))
206 | return -EFAULT;
207 |
208 | wq->active_chunk = list_first_entry(&wq->free_chunks, struct write_chunk, list);
209 | list_del_init(&wq->active_chunk->list);
210 |
211 | return 0;
212 | }
213 |
214 | /*
215 | * Returns amount of free bytes in the chunk.
216 | */
217 | static size_t chunk_free_bytes(struct buse_wqueue *wq, struct write_chunk *ch)
218 | {
219 | void *end = wq->shmem + ch->shmem_offset + wq->buse->write_chunk_size;
220 | return end - ch->data_frontier;
221 | }
222 |
223 | /*
224 | * Splits long writes to multiple writes not crossing the collision areas boundary and adds the
225 | * sequential number to each write.
226 | */
227 | static void divide_add_collision(struct buse_cmd *cmd, struct write_chunk *ch)
228 | {
229 | struct buse* buse = cmd->queue->w.buse;
230 | size_t offset = blk_rq_pos(cmd->rq) * SECTOR_SIZE;
231 | s64 size = blk_rq_bytes(cmd->rq);
232 | size_t col_size = buse->collision_area_size;
233 | struct writelist_item write;
234 | size_t flag = req_op(cmd->rq);
235 |
236 | size_t new_size = round_up(offset+1, col_size) - offset;
237 | size_t col_id = offset / col_size;
238 | u64 id = atomic_add_return(1, &buse->collision_counters[col_id]);
239 | if (new_size > size)
240 | new_size = size;
241 |
242 |
243 | write.sector = offset / SECTOR_SIZE;
244 | write.len = new_size / SECTOR_SIZE;
245 | write.id = id;
246 | write.flag = flag;
247 | memcpy(ch->writelist_frontier, &write, sizeof(write));
248 | ch->writelist_frontier++;
249 | ch->num_writes++;
250 |
251 | offset += new_size;
252 | size -= new_size;
253 |
254 | for (; size > 0; size -= col_size, offset += col_size) {
255 | size_t col_id = offset / col_size;
256 | u64 id = atomic_add_return(1, &buse->collision_counters[col_id]);
257 | write.sector = offset / SECTOR_SIZE;
258 | write.len = col_size / SECTOR_SIZE;
259 | if (size < col_size)
260 | write.len = size / SECTOR_SIZE;
261 | write.id = id;
262 | write.flag = flag;
263 | memcpy(ch->writelist_frontier, &write, sizeof(write));
264 | ch->writelist_frontier++;
265 | ch->num_writes++;
266 | }
267 | }
268 |
269 | /*
270 | * Copy data to the shared memory from the memory specified by the io request.
271 | */
272 | static void copy_to_chunk(struct buse_cmd *cmd, struct write_chunk *ch)
273 | {
274 | char *src;
275 | size_t len;
276 | struct bio_vec bvec;
277 | struct req_iterator iter;
278 | struct request *rq = cmd->rq;
279 |
280 | divide_add_collision(cmd, ch);
281 |
282 | if (req_op(rq) == REQ_OP_WRITE) {
283 | rq_for_each_segment(bvec, rq, iter) {
284 | len = bvec.bv_len;
285 | src = kmap_atomic(bvec.bv_page);
286 | memcpy(ch->data_frontier, src + bvec.bv_offset, len);
287 | kunmap_atomic(src);
288 | ch->data_frontier += len;
289 | }
290 | }
291 | }
292 |
293 | /*
294 | * Compute number of needed slots in the metadata area of the write chunk since the write can be
295 | * split into multiple writes.
296 | */
297 | static size_t needed_slots(struct buse_cmd *cmd)
298 | {
299 | size_t size = blk_rq_bytes(cmd->rq);
300 | struct buse *buse = cmd->queue->w.buse;
301 |
302 | /* Upper bound of the crossing areas. */
303 | return size / buse->collision_area_size + 2;
304 | }
305 |
306 | /*
307 | * Number of free write metadata slots in the chunk.
308 | */
309 | static size_t chunk_free_slots(struct buse_wqueue * wq, struct write_chunk *ch)
310 | {
311 | size_t max_writes = wq->buse->write_chunk_size / wq->buse->block_size;
312 | return max_writes - ch->num_writes;
313 | }
314 |
315 | /*
316 | * True if the chunk is termination chunk.
317 | */
318 | bool is_wqueue_term(struct write_chunk *ch)
319 | {
320 | return ch->shmem_offset == -1;
321 | }
322 |
323 | /*
324 | * Sends termination chunk to the write queue.
325 | */
326 | void wqueue_send_term(struct buse_wqueue *wq)
327 | {
328 | struct write_chunk *fake_chunk;
329 |
330 | again:
331 | mutex_lock(&wq->lock);
332 |
333 | if (!list_empty(&wq->busy_chunks)) {
334 | mutex_unlock(&wq->lock);
335 | wait_event_interruptible(wq->free_chunks_avail, list_empty(&wq->busy_chunks));
336 | goto again;
337 | }
338 |
339 | if (wq_has_sleeper(&wq->free_chunks_avail)) {
340 | wake_up(&wq->free_chunks_avail);
341 | mutex_unlock(&wq->lock);
342 | goto again;
343 | }
344 |
345 | fake_chunk = kzalloc(sizeof(*fake_chunk), GFP_KERNEL);
346 | if (!fake_chunk) {
347 | pr_debug("Cannot allocate for term uspace_packet!\n");
348 | return;
349 | }
350 |
351 | fake_chunk->shmem_offset = (u64)-1;
352 |
353 | close_chunk(wq);
354 | list_add_tail(&fake_chunk->list, &wq->busy_chunks);
355 | wq->terminated = true;
356 | wake_up(&wq->busy_chunks_avail);
357 |
358 | mutex_unlock(&wq->lock);
359 | }
360 |
361 | /*
362 | * Copies data to the active chunk and immediately acknowledge the write request.
363 | */
364 | blk_status_t buse_write(struct buse_cmd *cmd)
365 | {
366 | struct buse_queue *q = cmd->queue;
367 | struct buse_wqueue *wq = &q->w;
368 | struct request *rq = cmd->rq;
369 | size_t max_writes = wq->buse->write_chunk_size / wq->buse->block_size;
370 |
371 | if (req_op(rq) == REQ_OP_WRITE)
372 | BUG_ON(blk_rq_bytes(rq) > wq->buse->write_chunk_size - max_writes * sizeof(struct writelist_item));
373 |
374 | again:
375 | if (cmd->canceled)
376 | return BLK_STS_IOERR;
377 |
378 | mutex_lock(&wq->lock);
379 |
380 | if (wq->terminated) {
381 | mutex_unlock(&wq->lock);
382 | return BLK_STS_IOERR;
383 | }
384 |
385 | if (wq->active_chunk &&
386 | (chunk_free_bytes(wq, wq->active_chunk) < blk_rq_bytes(rq) ||
387 | chunk_free_slots(wq, wq->active_chunk) < needed_slots(cmd)))
388 | close_chunk(wq);
389 |
390 | if (!wq->active_chunk && open_chunk(wq) < 0) {
391 | mutex_unlock(&wq->lock);
392 | wait_event_interruptible(wq->free_chunks_avail, !list_empty(&wq->free_chunks));
393 | goto again;
394 | }
395 |
396 | blk_mq_start_request(rq);
397 | BUG_ON(wq->active_chunk->num_writes > max_writes);
398 | copy_to_chunk(cmd, wq->active_chunk);
399 | mutex_unlock(&wq->lock);
400 |
401 | blk_mq_end_request(rq, BLK_STS_OK);
402 |
403 | return BLK_STS_OK;
404 | }
405 |
406 | /*
407 | * Send flush chunk to the queue.
408 | */
409 | static int send_flush(struct buse_wqueue* wq, struct buse_cmd *cmd)
410 | {
411 | struct write_chunk *fake_chunk = kmalloc(sizeof(*fake_chunk), GFP_KERNEL);
412 | if (!fake_chunk) {
413 | pr_debug("Cannot allocate for flush uspace_packet!\n");
414 | return -1;
415 | }
416 |
417 | fake_chunk->shmem_offset = (u64)fake_chunk;
418 | fake_chunk->num_writes = (u64)fake_chunk;
419 | fake_chunk->cmd = cmd;
420 |
421 | list_add_tail(&fake_chunk->list, &wq->busy_chunks);
422 | wake_up(&wq->busy_chunks_avail);
423 |
424 | return 0;
425 | }
426 |
427 | /*
428 | * Per queue flush operation. Closes active chunk and immediately after it sends the flush chunk.
429 | */
430 | static int wqueue_flush(void *data)
431 | {
432 | struct cmd_q_args *args = data;
433 | struct buse_wqueue *wq = &args->q->w;
434 | struct buse_cmd *cmd = args->cmd;
435 |
436 | mutex_lock(&wq->lock);
437 | close_chunk(wq);
438 | if (send_flush(wq, args->cmd) == -1) {
439 | pr_debug("Cannot send flush packet from flusher!\n");
440 | cmd->canceled = true;
441 | if (atomic_dec_and_test(&cmd->flush.queues_pending)) {
442 | blk_mq_start_request(cmd->rq);
443 | blk_mq_end_request(cmd->rq, BLK_STS_AGAIN);
444 | }
445 | }
446 | mutex_unlock(&wq->lock);
447 |
448 | kfree(data);
449 | do_exit(0);
450 | }
451 |
452 | /*
453 | * Flush operation. It broadcasts flush to all queues.
454 | */
455 | blk_status_t buse_flush(struct buse_cmd *cmd)
456 | {
457 | int i;
458 | struct cmd_q_args *args;
459 | struct buse_queue *q = cmd->queue;
460 | struct buse *buse = q->w.buse;
461 | size_t num_queues = buse->num_queues;
462 |
463 | atomic_set(&cmd->flush.queues_pending, num_queues);
464 |
465 | for (i = 0; i < num_queues; i++) {
466 | args = kzalloc(sizeof(*args), GFP_KERNEL);
467 | if (!args) {
468 | pr_debug("Cannot allocate!\n");
469 | goto err;
470 | }
471 |
472 | args->cmd = cmd;
473 | args->q = &buse->queues[i];
474 |
475 | if (kthread_run(wqueue_flush, args, "buse-flush%d", i) < 0) {
476 | pr_alert("Cannot spawn wqueue_flush thread!\n");
477 | goto err_args;
478 | }
479 | }
480 |
481 | return BLK_STS_OK;
482 |
483 | err_args:
484 | kfree(args);
485 | err:
486 | atomic_sub(num_queues - i, &cmd->flush.queues_pending);
487 | cmd->canceled = true;
488 |
489 | if (!i)
490 | return BLK_STS_AGAIN;
491 |
492 | return BLK_STS_OK;
493 | }
494 |
495 | /*
496 | * Another implementation of the flush logic. This one does flush broadcasting sequentially without
497 | * spawning additional threads. Kept here for potentional architecture change in the future.
498 | */
499 | /*
500 | * blk_status_t buse_flush(struct buse_cmd *cmd)
501 | * {
502 | * int i;
503 | * struct buse_queue *q = cmd->queue;
504 | * struct buse *buse = q->w.buse;
505 | * size_t num_queues = buse->num_queues;
506 | * struct buse_wqueue *wq;
507 | * size_t collision_areas = buse->size / buse->collision_area_size;
508 | *
509 | * atomic_set(&cmd->flush.queues_pending, num_queues);
510 | *
511 | * for (i = 0; i < num_queues; i++) {
512 | * wq = &buse->queues[i].w;
513 | * mutex_lock(&wq->lock);
514 | * }
515 | *
516 | * for (i = 0; i < num_queues; i++) {
517 | * wq = &buse->queues[i].w;
518 | * close_chunk(wq);
519 | * if (send_flush(wq, cmd) == -1) {
520 | * pr_debug("Cannot send flush packet from flusher!\n");
521 | * cmd->canceled = true;
522 | * if (atomic_dec_and_test(&cmd->flush.queues_pending)) {
523 | * blk_mq_start_request(cmd->rq);
524 | * blk_mq_end_request(cmd->rq, BLK_STS_AGAIN);
525 | * }
526 | * break;
527 | * }
528 | * }
529 | *
530 | * memset(wq->buse->collision_counters, 0, collision_areas);
531 | *
532 | * for (i = 0; i < num_queues; i++) {
533 | * wq = &buse->queues[i].w;
534 | * mutex_unlock(&wq->lock);
535 | * }
536 | *
537 | * return BLK_STS_OK;
538 | * }
539 | */
540 |
541 | /*
542 | * Drains all the queues because the is shutting down non-gracefully and we don't want memory leaks.
543 | */
544 | static void wqueue_drain(struct buse_wqueue *wq)
545 | {
546 | struct write_chunk *chunk;
547 |
548 | mutex_lock(&wq->lock);
549 | close_chunk(wq);
550 | while (!list_empty(&wq->busy_chunks)) {
551 | chunk = list_first_entry(&wq->busy_chunks, struct write_chunk, list);
552 | mutex_unlock(&wq->lock);
553 | if (is_wqueue_term(chunk)) {
554 | mutex_lock(&wq->lock);
555 | list_del_init(&chunk->list);
556 | mutex_unlock(&wq->lock);
557 | kfree(chunk);
558 | } else
559 | ack_write_request(wq, chunk->shmem_offset, true);
560 | mutex_lock(&wq->lock);
561 | }
562 |
563 | while (!list_empty(&wq->fetched_chunks)) {
564 | chunk = list_first_entry(&wq->fetched_chunks, struct write_chunk, list);
565 | mutex_unlock(&wq->lock);
566 | ack_write_request(wq, chunk->shmem_offset, true);
567 | mutex_lock(&wq->lock);
568 | }
569 | mutex_unlock(&wq->lock);
570 | }
571 |
572 | /*
573 | * Deallocates the write queue.
574 | */
575 | static void wqueue_exit(struct buse_wqueue *wq)
576 | {
577 | wqueue_drain(wq);
578 | kfree(wq->chunks);
579 | vfree(wq->shmem);
580 | }
581 |
582 | /*
583 | * Allocates the write queue.
584 | */
585 | static int wqueue_init(struct buse_wqueue *wq)
586 | {
587 | int ret, i;
588 | struct buse *buse = wq->buse;
589 | uint w_chunks = buse->write_shm_size / buse->write_chunk_size;
590 | int numa_node = buse_get_numa_node_for_queue_id(wq->buse, wq->q->id);
591 |
592 | init_waitqueue_head(&wq->busy_chunks_avail);
593 | init_waitqueue_head(&wq->free_chunks_avail);
594 | INIT_LIST_HEAD(&wq->free_chunks);
595 | INIT_LIST_HEAD(&wq->busy_chunks);
596 | INIT_LIST_HEAD(&wq->fetched_chunks);
597 |
598 | mutex_init(&wq->lock);
599 |
600 | wq->size = buse->write_shm_size;
601 |
602 | wq->shmem = vmalloc_node(wq->size, numa_node);
603 | if (wq->shmem == NULL) {
604 | ret = -ENOMEM;
605 | goto err;
606 | }
607 |
608 | wq->chunks = kcalloc_node(w_chunks, sizeof(*wq->chunks), GFP_KERNEL, numa_node);
609 | if (!wq->chunks) {
610 | ret = -ENOMEM;
611 | goto err_shmem;
612 | }
613 |
614 | for (i = 0; i < w_chunks; i++)
615 | init_write_chunk(wq, &wq->chunks[i]);
616 |
617 | open_chunk(wq);
618 |
619 | return 0;
620 |
621 | err_shmem:
622 | vfree(wq->shmem);
623 | err:
624 | return ret;
625 | }
626 |
627 | /*
628 | * Init all write queues.
629 | */
630 | int buse_wqueues_init(struct buse *buse)
631 | {
632 | int ret, i;
633 | struct buse_queue *q;
634 | size_t collisions_areas = buse->size / buse->collision_area_size;
635 |
636 | for (i = 0, q = buse->queues; i < buse->num_queues; i++, q++) {
637 | q->w.buse = buse;
638 | q->w.q = q;
639 | ret = wqueue_init(&q->w);
640 | if (ret) {
641 | i++;
642 | q++;
643 | goto err;
644 | }
645 | }
646 |
647 | buse->collision_counters = kcalloc(collisions_areas, sizeof(*buse->collision_counters), GFP_KERNEL);
648 | if (!buse->collision_counters) {
649 | ret = -ENOMEM;
650 | goto err;
651 | }
652 |
653 | return 0;
654 |
655 | err:
656 | for (i--, q--; i > 0; i--, q--)
657 | wqueue_exit(&q->w);
658 |
659 | return ret;
660 | }
661 |
662 | /*
663 | * Deinit all write queues.
664 | */
665 | int buse_wqueues_exit(struct buse *buse)
666 | {
667 | int i;
668 | struct buse_queue *q;
669 |
670 | for (i = 0, q = buse->queues; i < buse->num_queues; i++, q++)
671 | wqueue_exit(&q->w);
672 |
673 | kfree(buse->collision_counters);
674 |
675 | return 0;
676 | }
677 |
678 | /*
679 | * Rerun all fetched chunks by the user space again. This is called when user space failes without
680 | * acknowledging write chunks and reconnects again.
681 | */
682 | static void rerun_write_chunks(struct buse_wqueue *wq)
683 | {
684 | struct write_chunk *ch;
685 |
686 | mutex_lock(&wq->lock);
687 | while (!list_empty(&wq->fetched_chunks)) {
688 | ch = list_last_entry(&wq->fetched_chunks, struct write_chunk, list);
689 | list_del_init(&ch->list);
690 | list_add(&ch->list, &wq->busy_chunks);
691 | }
692 | wake_up(&wq->busy_chunks_avail);
693 | mutex_unlock(&wq->lock);
694 | }
695 |
696 | /*
697 | * Set the queue to be bound.
698 | */
699 | void buse_wqueue_bind(struct buse_wqueue *wq)
700 | {
701 | atomic_set(&wq->bound, 1);
702 | buse_blkdev_init_cond(wq->buse);
703 | rerun_write_chunks(wq);
704 | }
705 |
706 | /*
707 | * Returns true if all write queues are bound. I.e. have connected the userspace counterpart.
708 | */
709 | bool buse_wqueues_bound(struct buse *buse)
710 | {
711 | int i;
712 | struct buse_wqueue *wq;
713 |
714 | for (i = 0; i < buse->num_queues; i++) {
715 | wq = &buse->queues[i].w;
716 | if (atomic_read(&wq->bound) == 0)
717 | return false;
718 | }
719 |
720 | return true;
721 | }
722 |
--------------------------------------------------------------------------------
/kernel/buse-wqueue.h:
--------------------------------------------------------------------------------
1 | /* Copyright (C) 2021 Vojtech Aschenbrenner */
2 |
3 | #ifndef BUSE_WQUEUE_H
4 | #define BUSE_WQUEUE_H
5 |
6 | #include
7 | #include "main.h"
8 |
9 | /*
10 | * When userspace acknowledge the write chunk we perform appropriate actions based on the write
11 | * chunk type.
12 | */
13 | void ack_write_request(struct buse_wqueue *wq, u64 chunk_offset, bool draining);
14 |
15 | /*
16 | * Copies data to the active chunk and immediately acknowledge the write request.
17 | */
18 | blk_status_t buse_write(struct buse_cmd *cmd);
19 |
20 | /*
21 | * Init all write queues.
22 | */
23 | int buse_wqueues_init(struct buse *buse);
24 |
25 | /*
26 | * Deinit all write queues.
27 | */
28 | int buse_wqueues_exit(struct buse *buse);
29 |
30 | /*
31 | * Flush operation. It broadcasts flush to all queues.
32 | */
33 | blk_status_t buse_flush(struct buse_cmd *cmd);
34 |
35 | /*
36 | * Closes active chunk of the queue, i.e. no more writes can be written to the chunk and a new
37 | * chunks has to be opened. This usually means that flush happened or the chunk is full.
38 | */
39 | int close_chunk(struct buse_wqueue *wq);
40 |
41 | bool is_flush_packet(struct write_chunk *wc);
42 |
43 | /*
44 | * Pulls write chunk from the busy queue and returns it. If there is no write chunk in the busy queue,
45 | * we sleep. If the chunks is not a termination chunk, we add to the fetched list meaning that the
46 | * chunk is in userspace but not yet acknowledged. It is for the case of userspace failure and
47 | * potential rerun fetched but not yet acknowledged chunks.
48 | */
49 | struct write_chunk *pop_write_request_wait(struct buse_wqueue *wq);
50 |
51 | /*
52 | * Returns true if all write queues are bound. I.e. have connected the userspace counterpart.
53 | */
54 | bool buse_wqueues_bound(struct buse *buse);
55 |
56 | /*
57 | * Set the queue to be bound.
58 | */
59 | void buse_wqueue_bind(struct buse_wqueue *wq);
60 |
61 | /*
62 | * Sends termination chunk to the write queue.
63 | */
64 | void wqueue_send_term(struct buse_wqueue *wq);
65 |
66 | /*
67 | * True if the chunk is termination chunk.
68 | */
69 | bool is_wqueue_term(struct write_chunk *ch);
70 |
71 | #endif
72 |
--------------------------------------------------------------------------------
/kernel/main.c:
--------------------------------------------------------------------------------
1 | /* Copyright (C) 2021-2022 Vojtech Aschenbrenner */
2 |
3 | #include
4 | #include
5 |
6 | #include "buse-blkdev.h"
7 | #include "buse-chrdev.h"
8 | #include "buse-configfs.h"
9 | #include "buse-rqueue.h"
10 | #include "buse-wqueue.h"
11 | #include "main.h"
12 |
13 | const char *buse_blkdev_name = "buse";
14 | const int buse_blkdev_max_minors = 16;
15 | int buse_blkdev_major;
16 | struct class *buse_chrdev_class;
17 |
18 | /*
19 | * Add new buse device with index and sets default parameters. All parameters can be changed via configfs.
20 | */
21 | struct buse *buse_add(uint index)
22 | {
23 | int ret;
24 |
25 | struct buse *buse = kzalloc(sizeof(*buse), GFP_KERNEL);
26 | if (!buse) {
27 | ret = -ENOMEM;
28 | goto err;
29 | }
30 |
31 | atomic_set(&buse->stopped, 1);
32 |
33 | mutex_init(&buse->configfs_mutex);
34 | buse->index = index;
35 | buse->size = SZ_1G;
36 | buse->block_size = 512;
37 | buse->io_min = buse->block_size;
38 | buse->io_opt = buse->block_size;
39 | buse->write_chunk_size = 2 * SZ_1M;
40 | buse->write_shm_size = 32 * SZ_1M;
41 | buse->read_shm_size = buse->write_shm_size;
42 | buse->queue_depth = 64;
43 | buse->no_scheduler = true;
44 | buse->can_secure_erase = false;
45 | buse->can_discard = false;
46 | buse->can_write_same = false;
47 | buse->can_write_zeroes = false;
48 | buse->hw_queues = 1;
49 | buse->collision_area_size = 4096;
50 |
51 | return buse;
52 |
53 | err:
54 | return ERR_PTR(ret);
55 | }
56 |
57 | /*
58 | * Checks whether all queues are connected and creates the block device eventually.
59 | */
60 | void buse_blkdev_init_cond(struct buse *buse)
61 | {
62 | int ret;
63 |
64 | if (!buse_wqueues_bound(buse) ||
65 | !buse_rqueues_bound(buse) ||
66 | buse->blkdev.created)
67 | return;
68 |
69 | buse->blkdev.created = true;
70 | buse_gendisk_register(buse);
71 | return;
72 |
73 | ret = buse_blkdev_init(buse);
74 | if (ret)
75 | goto err;
76 |
77 | return;
78 |
79 | err:
80 | return;
81 | }
82 |
83 | /*
84 | * Initialize all structures for created device.
85 | */
86 | int buse_on(struct buse *buse)
87 | {
88 | int ret;
89 |
90 | buse->queues = kcalloc(buse->hw_queues, sizeof(*buse->queues), GFP_KERNEL);
91 | if (!buse->queues) {
92 | ret = -ENOMEM;
93 | goto err;
94 | }
95 |
96 | ret = buse_blkdev_init(buse);
97 | if (ret)
98 | goto err_queues;
99 |
100 | ret = buse_chrdev_init(buse);
101 | if (ret)
102 | goto err_blk;
103 |
104 | ret = buse_rqueues_init(buse);
105 | if (ret)
106 | goto err_chr;
107 |
108 | ret = buse_wqueues_init(buse);
109 | if (ret)
110 | goto err_r_init;
111 |
112 | return 0;
113 |
114 | err_r_init:
115 | buse_rqueues_exit(buse);
116 | err_chr:
117 | buse_chrdev_exit(buse);
118 | err_blk:
119 | buse_blkdev_exit(buse);
120 | err_queues:
121 | kfree(buse->queues);
122 | buse->queues = NULL;
123 | err:
124 | return ret;
125 | }
126 |
127 | /*
128 | * Deletes all the structures needed by the device.
129 | */
130 | int buse_off(struct buse *buse)
131 | {
132 | if (!buse->queues)
133 | return -EINVAL;
134 |
135 | if (buse_wqueues_bound(buse) ||
136 | buse_rqueues_bound(buse))
137 | return -EBUSY;
138 |
139 | buse_wqueues_exit(buse);
140 | buse_rqueues_exit(buse);
141 | buse_chrdev_exit(buse);
142 | buse_blkdev_exit(buse);
143 | kfree(buse->queues);
144 | buse->queues = NULL;
145 |
146 | return 0;
147 | }
148 |
149 | /*
150 | * Frees the buse structure.
151 | */
152 | void buse_del(struct buse *buse)
153 | {
154 | kfree(buse);
155 | }
156 |
157 | /*
158 | * Sends the termination chunks to all queues signaling that the device is stopping. This is
159 | * reaction to writing 0 to the power configfs attribute. When the userspace disconnect all the
160 | * queues, it can call buse_off().
161 | */
162 | void buse_stop(struct buse *buse)
163 | {
164 | int i;
165 | struct buse_wqueue *wq;
166 | struct buse_rqueue *rq;
167 |
168 | if (!buse->queues)
169 | return;
170 |
171 | for (i = 0; i < buse->num_queues; i++) {
172 | wq = &buse->queues[i].w;
173 | wqueue_send_term(wq);
174 | }
175 |
176 | for (i = 0; i < buse->num_queues; i++) {
177 | rq = &buse->queues[i].r;
178 | rqueue_send_term(rq);
179 | }
180 | }
181 |
182 | /*
183 | * Kernel module init function which is ran when the module is loaded. It just registers majors for
184 | * block device and character devices and initialize configfs subsystem. All further operations are
185 | * triggered from configfs.
186 | */
187 | static int __init buse_init(void)
188 | {
189 | int ret;
190 |
191 | buse_blkdev_major = register_blkdev(0, buse_blkdev_name);
192 | if (buse_blkdev_major < 0) {
193 | ret = buse_blkdev_major;
194 | goto err;
195 | }
196 |
197 | buse_chrdev_class = class_create(THIS_MODULE, buse_blkdev_name);
198 | if (IS_ERR(buse_chrdev_class)) {
199 | ret = PTR_ERR(buse_chrdev_class);
200 | goto err_blk;
201 | }
202 |
203 | ret = buse_configfs_init();
204 | if (ret)
205 | goto err_class;
206 |
207 | return 0;
208 |
209 | err_class:
210 | class_destroy(buse_chrdev_class);
211 | err_blk:
212 | unregister_blkdev(buse_blkdev_major, buse_blkdev_name);
213 | err:
214 | return ret;
215 | }
216 |
217 | /*
218 | * Kernel module exit function. Cleanup all module related structures. Module can be unloaded only
219 | * if all devices are destroyed.
220 | */
221 | static void __exit buse_exit(void)
222 | {
223 | class_destroy(buse_chrdev_class);
224 | unregister_blkdev(buse_blkdev_major, buse_blkdev_name);
225 |
226 | buse_configfs_exit();
227 | }
228 |
229 | module_init(buse_init);
230 | module_exit(buse_exit);
231 |
232 | MODULE_LICENSE("GPL");
233 | MODULE_AUTHOR("Vojtech Aschenbrenner ");
234 | MODULE_DESCRIPTION("BUSE");
235 | MODULE_VERSION("0.0.1");
236 |
--------------------------------------------------------------------------------
/kernel/main.h:
--------------------------------------------------------------------------------
1 | /* Copyright (C) 2021-2022 Vojtech Aschenbrenner */
2 |
3 | #ifndef BUSE_MAIN_H
4 | #define BUSE_MAIN_H
5 |
6 | #include
7 | #include
8 | #include
9 |
10 | #define BUSE_MAGIC 0xB3
11 |
12 | extern const char *buse_blkdev_name;
13 | extern const int buse_blkdev_max_minors;
14 | extern int buse_blkdev_major;
15 | extern struct class *buse_chrdev_class;
16 |
17 | /*
18 | * Per block device structure containing all necessary fields for creating mq block device.
19 | */
20 | struct buse_blkdev
21 | {
22 | struct blk_mq_tag_set tag_set;
23 | struct gendisk *disk;
24 | struct request_queue *request_queue;
25 |
26 | /* Flag which is set once the device is created. This is important
27 | * because we don't create device immediately but wait until all
28 | * control queues are connected. Hence it is important to keep track of
29 | * it to know whether to destroy the block device during shut down.
30 | */
31 | bool created;
32 | };
33 |
34 | /*
35 | * Global module structure.
36 | */
37 | struct buse
38 | {
39 | /* Configfs related fields. */
40 | struct config_item item;
41 | struct mutex configfs_mutex;
42 |
43 | /* Indicator that device was stopped. All further io requests are refused. */
44 | atomic_t stopped;
45 |
46 | /* Block device related structure. */
47 | struct buse_blkdev blkdev;
48 |
49 | /* Sequential numbers for writes. We define one counter per collision
50 | * domain to avoid excessive cache coherency protocol traffic. This
51 | * creates ordering on all writes inside the collision domain which is
52 | * enough. A counter per sector would be optimal, but memory
53 | * inefficient. One counter per whole address space would be to
54 | * contended. Collision domains are a good compromise.
55 | */
56 | atomic_t *collision_counters;
57 |
58 | /* Individual queues structure related to the created character
59 | * devices. */
60 | struct buse_queue *queues;
61 | int num_queues;
62 |
63 | /* Attributes set by configfs operations. */
64 |
65 | /* Setting to 1 powers on the device and queues can be bound. */
66 | bool power;
67 |
68 | /* Index of the created block device corresponding to the created
69 | * configfs node with mkdir. */
70 | u64 index;
71 |
72 | /* Size of the device in bytes. */
73 | u64 size;
74 |
75 | /* Block size. This should be 512 or 4096. */
76 | u64 block_size;
77 |
78 | /* Minimal IO size. Has to be >= block_size and a power of 2. */
79 | u64 io_min;
80 |
81 | /* Optimal IO size. Has to be >= block_size and a power of 2. */
82 | u64 io_opt;
83 |
84 | /* Max size of one write chunk which is passed to the userspace. */
85 | u64 write_chunk_size;
86 |
87 | /* Size of the shared memory between kernel and userspace which is used
88 | * for sending write chunks to the userspace. This is per one write
89 | * queue.
90 | */
91 | u64 write_shm_size;
92 |
93 | /* Size of the shared memory between kernel and userspace which is used
94 | * for sending individual reads to the userspace. This is per one write
95 | * queue. Compared to writes reads are not batched into chunks. Each
96 | * individual read is sent to userspace.
97 | */
98 | u64 read_shm_size;
99 |
100 | /* Queue depth of the created block device. */
101 | u64 queue_depth;
102 |
103 | /* Number of hw queues block device provides. Usually number of CPUs is
104 | * the right value. */
105 | u64 hw_queues;
106 |
107 | /* Size of the area sharing the space of write sequential numbers. */
108 | u64 collision_area_size;
109 |
110 | /* Instructs blk-mq no to use scheduler on the queues. */
111 | bool no_scheduler;
112 |
113 | /* For future usage. */
114 | bool can_secure_erase;
115 | bool can_write_same;
116 | bool can_write_zeroes;
117 | bool can_discard;
118 | };
119 |
120 | /*
121 | * Per character device structure. Character device represents a queue in our model.
122 | */
123 | struct buse_chrdev
124 | {
125 | struct cdev cdev;
126 | struct device *dev;
127 | dev_t region;
128 | };
129 |
130 | /*
131 | * Read queue structure.
132 | */
133 | struct buse_rqueue
134 | {
135 | /* Pointer to the main buse structure. */
136 | struct buse *buse;
137 |
138 | /* Pointer to the corresponding struct queue */
139 | struct buse_queue *q;
140 |
141 | /* Character device corresponding to the read queue. */
142 | struct buse_chrdev chrdev;
143 |
144 | /* Shared memory area between kernel and user space. */
145 | void *shmem;
146 | size_t size;
147 |
148 | /* Flag whether individual queue is bound, i.e. the character device is
149 | * opened and mmaped. */
150 | atomic_t bound;
151 |
152 | /* Mapping from the bitmap index to the read chunk. Used when bitmap
153 | * index is acknowledged to know what read to acknowledge.
154 | */
155 | struct read_chunk **chunk_from_bitmap;
156 |
157 | /* Waitqueue on the event when no busy chunk is available, i.e. there
158 | * is nothing to send to the userspace.
159 | */
160 | wait_queue_head_t busy_chunks_avail;
161 |
162 | /* Waitqueue on the event when no free chunk is available, i.e. there
163 | * is no space to process additional reads.
164 | */
165 | wait_queue_head_t free_chunks_avail;
166 |
167 | /* Lock per the whole read queue. */
168 | struct mutex lock;
169 |
170 | /* Bitmap for keeping track of free space in shared memory. */
171 | unsigned long *free_chunks_bitmap;
172 |
173 | /* Queue with chunks ready to be sent to user space. */
174 | struct list_head busy_chunks;
175 |
176 | /* Queue with chunks already sent to user space. Important when user
177 | * space side crashes to rerun not acknowledged but fetched reads
178 | * again.
179 | */
180 | struct list_head fetched_chunks;
181 |
182 | /* If true the termination chunk was already sent to user space and no
183 | * other chunk can be processed by the other end of the queue.
184 | */
185 | bool terminated;
186 | };
187 |
188 | /*
189 | * Description of individual write in the metadata part of the chunk.
190 | */
191 | struct writelist_item
192 | {
193 | /* First written sector. */
194 | size_t sector;
195 |
196 | /* Length of the write in sectors. */
197 | size_t len;
198 |
199 | /* Sequential number of write. */
200 | size_t id;
201 |
202 | /* Reserved for future usage. */
203 | size_t flag;
204 | };
205 |
206 | /*
207 | * Write chunk is the unit sent to the user space. It containes batched writes and is split into two
208 | * parts. Metadata part containes information about the writes and data part contains their data.
209 | */
210 | struct write_chunk
211 | {
212 | /* Chunk can be part of list. */
213 | struct list_head list;
214 |
215 | /* Offset to the shared memory where the chunk starts. */
216 | u64 shmem_offset;
217 |
218 | /* Number of writes batched in the chunk. */
219 | u64 num_writes;
220 |
221 | /* Helper pointer to keep track where next write of data should go. */
222 | void *data_frontier;
223 |
224 | /* Helper pointer to keep track where next write of metadata should go. */
225 | struct writelist_item *writelist_frontier;
226 |
227 | /* List of all reads waiting for any write in the chunk. These reads
228 | * are postponed and woken up when the write is acknowledged. Solution
229 | * of the read after write hazard.
230 | */
231 | struct list_head dependent_reads;
232 |
233 | /* If the chunks is flush chunk, i.e. just performing the flush
234 | * operation, we store the cmd pointer here to be able to acknowledge
235 | * it easily.
236 | */
237 | struct buse_cmd *cmd;
238 | };
239 |
240 | /*
241 | * Read chunk is the unit sent to the user space. In contrast to write chunk it has variable length
242 | * and corresponds to exactly one read request.
243 | */
244 | struct read_chunk
245 | {
246 | /* Part of the list. */
247 | struct list_head list;
248 |
249 | /* First sector of the read. */
250 | size_t sector;
251 |
252 | /* Length of the read in sectors. */
253 | size_t len;
254 |
255 | /* Offset in the shared memory where the chunk starts. */
256 | size_t shmem_offset;
257 |
258 | /* Pointer to cmd which has to acknowledged when this chunk is acknowledged. */
259 | struct buse_cmd *cmd;
260 | };
261 |
262 | /*
263 | * Write queue structure.
264 | */
265 | struct buse_wqueue
266 | {
267 | /* Pointer to the main buse structure. */
268 | struct buse *buse;
269 |
270 | /* Pointer to the corresponding struct queue */
271 | struct buse_queue *q;
272 |
273 | /* Character device corresponding to the read queue. */
274 | struct buse_chrdev chrdev;
275 |
276 | /* Shared memory area between kernel and user space. */
277 | void *shmem;
278 | size_t size;
279 |
280 | /* Flag whether individual queue is bound, i.e. the character device is
281 | * opened and mmaped.
282 | */
283 | atomic_t bound;
284 |
285 | /* Waitqueue on the event when no busy chunk is available, i.e. there
286 | * is nothing to send to the userspace.
287 | */
288 | wait_queue_head_t busy_chunks_avail;
289 |
290 | /* Waitqueue on the event when no free chunk is available, i.e. there
291 | * is no space to process additional reads.
292 | */
293 | wait_queue_head_t free_chunks_avail;
294 |
295 | /* Array of all write chunks. */
296 | struct write_chunk *chunks;
297 |
298 | /* Lock per the whole write queue. */
299 | struct mutex lock;
300 |
301 |
302 | /* Queue keeping track of free write chunks. */
303 | struct list_head free_chunks;
304 |
305 | /* Queue with chunks ready to be sent to user space. */
306 | struct list_head busy_chunks;
307 |
308 | /* Queue with chunks already sent to user space. Important when user
309 | * space side crashes to rerun not acknowledged but fetched writes
310 | * again.
311 | */
312 | struct list_head fetched_chunks;
313 |
314 | /* Currently active chunk in the individual queue. All writes are going to this chunk. */
315 | struct write_chunk *active_chunk;
316 |
317 | /* If true the termination chunk was already sent to user space and no
318 | * other chunk can be processed by the other end of the queue.
319 | */
320 | bool terminated;
321 | };
322 |
323 | /*
324 | * Putting read and write queues together and assign them id. Just for convenience and easier
325 | * debugging.
326 | */
327 | struct buse_queue
328 | {
329 | struct buse_rqueue r;
330 | struct buse_wqueue w;
331 | size_t id;
332 | };
333 |
334 | /*
335 | * Request extension to be insertable to the list.
336 | */
337 | struct rq_node
338 | {
339 | struct list_head list;
340 | struct request *rq;
341 | };
342 |
343 | /*
344 | * Custom cmd which is allocated for each cmd comming from the blk-mq queue. It contains
345 | */
346 | struct buse_cmd
347 | {
348 | /* Magic number to be more sure we read the right memory. */
349 | u8 magic;
350 |
351 | /* Corresponding request to the cmd. */
352 | struct request *rq;
353 |
354 | /* Queue where the request arrived. */
355 | struct buse_queue *queue;
356 |
357 | /* True if some operation failed and at the end the cmd should be
358 | * canceled and report it to the blk-mq.
359 | */
360 | bool canceled;
361 |
362 | /* Helper fields for different types of commands. */
363 | union {
364 | struct {
365 | /* How many more queues need to do their check for read
366 | * after write hazard.
367 | */
368 | atomic_t queues_pending;
369 |
370 | /* How many writes need to be acknowledged until the
371 | * read can be send to user space.
372 | */
373 | atomic_t write_deps;
374 | } read;
375 |
376 | struct {
377 | /* How many more queues need to send the flush chunk.
378 | * This is used when broadcasting flush command.
379 | */
380 | atomic_t queues_pending;
381 | } flush;
382 | };
383 | };
384 |
385 | /*
386 | * Helper for passing arguments when creating new thread.
387 | */
388 | struct cmd_q_args
389 | {
390 | struct buse_cmd *cmd;
391 | struct buse_queue *q;
392 | };
393 |
394 | /* Adds new buse device with given index. */
395 | struct buse *buse_add(uint index);
396 |
397 | /* Delete buse device. */
398 | void buse_del(struct buse *buse);
399 |
400 | /* Turns on buse. */
401 | int buse_on(struct buse *buse);
402 |
403 | /* Turns off buse. Cannot be started again. */
404 | int buse_off(struct buse *buse);
405 |
406 | /* Stops buse. No io requests are accepted but can be started again. */
407 | void buse_stop(struct buse *buse);
408 |
409 | /* Checks if all queues are connected and if they are it creates the block
410 | * device and is ready to serve io commands.
411 | */
412 | void buse_blkdev_init_cond(struct buse *buse);
413 |
414 | #endif
415 |
--------------------------------------------------------------------------------
/lib/go/buse/buse.go:
--------------------------------------------------------------------------------
1 | // Copyright (C) 2021-2022 Vojtech Aschenbrenner
2 |
3 | package buse
4 |
5 | import (
6 | "encoding/binary"
7 | "errors"
8 | "fmt"
9 | "io/ioutil"
10 | "os"
11 | "runtime"
12 | "sync"
13 | "syscall"
14 |
15 | "golang.org/x/sys/unix"
16 | )
17 |
18 | const (
19 | // Character device for buse device %d and read queue %d.
20 | buseReadPathFmt = "/dev/buse%d-r%d"
21 |
22 | // Character device for buse device %d and write queue %d.
23 | buseWritePathFmt = "/dev/buse%d-w%d"
24 |
25 | // Path to the configfs directory.
26 | configFsPath = "/sys/kernel/config/buse"
27 |
28 | // Size of write request in write queue.
29 | writeRequestSize = 16
30 |
31 | // Size of read request in read queue.
32 | readRequestSize = 24
33 | )
34 |
35 | // Provides functions which are called by buse as a reaction to the received
36 | // command.
37 | type BuseReadWriter interface {
38 | // BuseRead should read the extent starting at the given sector with
39 | // the given length. The read data should be written to the provided
40 | // sice. The chunk is guaranteed to have sufficient capacity to hold
41 | // the data.
42 | //
43 | // This method is called by the BUSE library in response to a read
44 | // request received from the kernel driver.
45 | BuseRead(sector, length int64, chunk []byte) error
46 |
47 | // BuseWrite should handle all writes stored in the given chunk. The
48 | // first argument holds the number of writes in the chunk.
49 | //
50 | // This method is called by the BUSE library in response to a write
51 | // or flush request received from the kernel driver.
52 | BuseWrite(writes int64, chunk []byte) error
53 |
54 | // BusePreRun is called immediately before the device is started.
55 | BusePreRun()
56 |
57 | // BusePostRemove is called after the device is removed.
58 | BusePostRemove()
59 | }
60 |
61 | // Options for created buse device.
62 | type Options struct {
63 | Durable bool
64 | WriteChunkSize int64
65 | BlockSize int64
66 | IOMin int64
67 | IOOpt int64
68 | Threads int
69 | Major int64
70 | WriteShmSize int64
71 | ReadShmSize int64
72 | Size int64
73 | CollisionArea int64
74 | QueueDepth int64
75 | Scheduler bool
76 | CPUsPerNode int
77 | }
78 |
79 | // Buse is a library wrapping the low level interaction with buse kernel module
80 | // and provides simple API to for creating a block device in user space.
81 | type Buse struct {
82 | ReadWriter BuseReadWriter
83 | Options Options
84 | }
85 |
86 | // Returns new instance of Buse configured with options o.
87 | func New(rw BuseReadWriter, o Options) (Buse, error) {
88 | buse := Buse{
89 | ReadWriter: rw,
90 | Options: o,
91 | }
92 |
93 | err := buse.checkOptions()
94 | if err != nil {
95 | return Buse{}, err
96 | }
97 |
98 | err = buse.configure()
99 | if err != nil {
100 | return Buse{}, err
101 | }
102 |
103 | return buse, nil
104 | }
105 |
106 | // Returns total memory presented to the system.
107 | func totalMemory() (uint64, error) {
108 | sysInfo := &syscall.Sysinfo_t{}
109 |
110 | if err := syscall.Sysinfo(sysInfo); err != nil {
111 | return 0, err
112 | }
113 |
114 | // On 32-bit architectures the result is uint, hence we need to type it
115 | // to uint64 to conform with function signature.
116 | totalMemory := uint64(sysInfo.Totalram) * uint64(sysInfo.Unit)
117 |
118 | return totalMemory, nil
119 | }
120 |
121 | // Validates passed options.
122 | func (b *Buse) checkOptions() error {
123 | o := &b.Options
124 |
125 | if o.Threads == 0 || o.Threads > runtime.NumCPU() {
126 | o.Threads = runtime.NumCPU()
127 | }
128 |
129 | if o.CPUsPerNode == 0 || o.CPUsPerNode > runtime.NumCPU() {
130 | o.CPUsPerNode = runtime.NumCPU()
131 | }
132 |
133 | if o.IOMin == 0 {
134 | o.IOMin = o.BlockSize
135 | }
136 |
137 | if o.IOOpt == 0 {
138 | o.IOOpt = o.BlockSize
139 | }
140 |
141 | totalMem, err := totalMemory()
142 | if err != nil {
143 | return errors.New("Cannot read total amount of ram!")
144 | }
145 |
146 | neededMemory := uint64(o.Threads) * uint64(o.WriteShmSize+o.ReadShmSize)
147 | if neededMemory > totalMem {
148 | return errors.New("Not enough memory!")
149 | }
150 |
151 | if o.WriteShmSize%o.WriteChunkSize != 0 {
152 | return errors.New("Write buffer size has to be a multiple of chunk size!")
153 | }
154 |
155 | if o.BlockSize != 512 && o.BlockSize != 4096 {
156 | return errors.New("Block size has to 512 or 4096!")
157 | }
158 |
159 | if o.IOMin < o.BlockSize || o.IOMin%2 != 0 {
160 | return errors.New("Minimal IO has to be at least a block size and a power of 2!")
161 | }
162 |
163 | if o.IOOpt < o.BlockSize || o.IOOpt%2 != 0 {
164 | return errors.New("Optimal IO has to be at least a block size and a power of 2!")
165 | }
166 |
167 | return nil
168 | }
169 |
170 | // Performs configuration of the block device which is just being created. It
171 | // configures buse device via configs according to the options passed to the
172 | // New() function. When configuration succeed the device is power on.
173 | func (b *Buse) configure() error {
174 | var noScheduler int64
175 | if !b.Options.Scheduler {
176 | noScheduler = 1
177 | }
178 |
179 | configFsPath := fmt.Sprint(configFsPath, "/", b.Options.Major)
180 | if _, err := os.Stat(configFsPath); !os.IsNotExist(err) {
181 | return errors.New(fmt.Sprintf("Device buse%d already exists!", b.Options.Major))
182 | }
183 |
184 | if err := os.Mkdir(configFsPath, 0755); err != nil {
185 | return err
186 | }
187 |
188 | kernelParams := map[string]int64{
189 | "size": b.Options.Size,
190 | "collision_area_size": int64(b.Options.CollisionArea),
191 | "read_shm_size": int64(b.Options.ReadShmSize),
192 | "write_shm_size": int64(b.Options.WriteShmSize),
193 | "write_chunk_size": int64(b.Options.WriteChunkSize),
194 | "hw_queues": int64(b.Options.Threads),
195 | "blocksize": int64(b.Options.BlockSize),
196 | "io_min": int64(b.Options.IOMin),
197 | "io_opt": int64(b.Options.IOOpt),
198 | "queue_depth": int64(b.Options.QueueDepth),
199 | "no_scheduler": noScheduler,
200 | }
201 |
202 | for variable, value := range kernelParams {
203 | if err := b.setConfig(variable, value); err != nil {
204 | return err
205 | }
206 | }
207 |
208 | if err := b.setConfig("power", 1); err != nil {
209 | return err
210 | }
211 |
212 | return nil
213 | }
214 |
215 | // Opens control file and mmap it. Returns file and mmapped memory.
216 | func openAndMmapControlFile(chardev string, shm_size int) (*os.File, []byte, error) {
217 | f, err := os.OpenFile(chardev, os.O_RDWR, 0644)
218 | if err != nil {
219 | return nil, nil, err
220 | }
221 |
222 | shmem, err := syscall.Mmap(int(f.Fd()), 0, shm_size,
223 | syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED)
224 | if err != nil {
225 | f.Close()
226 | return nil, nil, err
227 | }
228 |
229 | return f, shmem, err
230 | }
231 |
232 | // Parses request reading from write queue character device.
233 | func (b *Buse) parseWriteRequest(request []byte) ([]byte, uint64, uint64) {
234 | raw := make([]byte, 8)
235 | copy(raw, request[:8])
236 | offset := binary.LittleEndian.Uint64(raw)
237 | writesLen := binary.LittleEndian.Uint64(request[8:16])
238 |
239 | return raw, offset, writesLen
240 | }
241 |
242 | // Parses request reading from read queue character device.
243 | func (b *Buse) parseReadRequest(request []byte) ([]byte, uint64, uint64, uint64) {
244 | raw := make([]byte, 8)
245 | copy(raw, request[16:24])
246 | offset := binary.LittleEndian.Uint64(raw)
247 |
248 | sector := binary.LittleEndian.Uint64(request[:8]) * 512 / uint64(b.Options.BlockSize)
249 | length := binary.LittleEndian.Uint64(request[8:16]) * 512 / uint64(b.Options.BlockSize)
250 |
251 | return raw, offset, sector, length
252 | }
253 |
254 | // True if the request means termination of the device.
255 | func isTermination(offset uint64) bool {
256 | return offset == ^uint64(0)
257 | }
258 |
259 | // True if the request is flush.
260 | func isFlush(offset uint64) bool {
261 | return offset > (1 << 32)
262 | }
263 |
264 | func (b *Buse) bindToLocalNumaNode(cpuId int) {
265 | localNode := cpuId / b.Options.CPUsPerNode
266 | firstCpu := localNode * b.Options.CPUsPerNode
267 | lastCpu := firstCpu + b.Options.CPUsPerNode - 1
268 |
269 | cpuSet := unix.CPUSet{}
270 | cpuSet.Zero()
271 |
272 | for c := firstCpu; c <= lastCpu; c++ {
273 | cpuSet.Set(c)
274 | }
275 |
276 | unix.SchedSetaffinity(0, &cpuSet)
277 | }
278 |
279 | // Infinite loop reading from write queue character device and calling
280 | // BuseWrite() callback provided by calling application. When the BuseWrite()
281 | // returns then the batched write is confirmed to the kernel leading to the
282 | // recycling of the buffer in shared memory.
283 | func (b *Buse) writer(chardev string, wgFunc *sync.WaitGroup, shm_size int) {
284 | defer wgFunc.Done()
285 |
286 | var major, cpuId int
287 | fmt.Sscanf(chardev, buseWritePathFmt, &major, &cpuId)
288 | b.bindToLocalNumaNode(cpuId)
289 |
290 | controlFile, shmem, err := openAndMmapControlFile(chardev, shm_size)
291 | if err != nil {
292 | panic(err)
293 | }
294 | defer controlFile.Close()
295 | defer syscall.Munmap(shmem)
296 |
297 | requestBuffer := make([]byte, writeRequestSize)
298 | wg := sync.WaitGroup{}
299 | for {
300 | _, err := controlFile.Read(requestBuffer)
301 | if err != nil {
302 | continue
303 | }
304 |
305 | offsetRaw, offset, writesLen := b.parseWriteRequest(requestBuffer)
306 |
307 | if isTermination(offset) {
308 | wg.Wait()
309 | return
310 | }
311 |
312 | if isFlush(offset) {
313 | if b.Options.Durable {
314 | wg.Wait()
315 | }
316 | controlFile.Write(offsetRaw)
317 | continue
318 | }
319 |
320 | dataRegion := shmem[offset : offset+uint64(b.Options.WriteChunkSize)]
321 | wg.Add(1)
322 | go func() {
323 | defer wg.Done()
324 |
325 | err := b.ReadWriter.BuseWrite(int64(writesLen), dataRegion)
326 | if err != nil {
327 | fmt.Fprintf(os.Stderr, "Chunk write (%d writes) failed!\n", writesLen)
328 | fmt.Fprint(os.Stderr, err)
329 | }
330 |
331 | n, err := controlFile.Write(offsetRaw)
332 | if err != nil {
333 | fmt.Fprint(os.Stderr, "Read ack error, n =", n, "err=", err.Error())
334 | fmt.Fprint(os.Stderr, err)
335 | }
336 | }()
337 | }
338 | }
339 |
340 | // Infinite loop reading from read queue character device and calling
341 | // BuseRead() callback provided by calling application. When the BuseRead()
342 | // returns then the read request is acknowledged to the kernel.
343 | func (b *Buse) reader(chardev string, wgFunc *sync.WaitGroup, shm_size int) {
344 | defer wgFunc.Done()
345 |
346 | var major, cpuId int
347 | fmt.Sscanf(chardev, buseReadPathFmt, &major, &cpuId)
348 | b.bindToLocalNumaNode(cpuId)
349 |
350 | controlFile, shmem, err := openAndMmapControlFile(chardev, shm_size)
351 | if err != nil {
352 | panic(err)
353 | }
354 | defer controlFile.Close()
355 | defer syscall.Munmap(shmem)
356 |
357 | requestBuffer := make([]byte, readRequestSize)
358 | var wg sync.WaitGroup
359 | for {
360 | _, err := controlFile.Read(requestBuffer)
361 | if err != nil {
362 | continue
363 | }
364 |
365 | offsetRaw, offset, sector, length := b.parseReadRequest(requestBuffer)
366 |
367 | if isTermination(offset) {
368 | wg.Wait()
369 | return
370 | }
371 |
372 | size := int64(length) * b.Options.BlockSize
373 | dataRegion := shmem[int64(offset) : int64(offset)+size]
374 |
375 | wg.Add(1)
376 | go func() {
377 | defer wg.Done()
378 |
379 | err := b.ReadWriter.BuseRead(int64(sector), int64(length), dataRegion)
380 | if err != nil {
381 | fmt.Fprint(os.Stderr, err)
382 | }
383 |
384 | _, err = controlFile.Write(offsetRaw)
385 | if err != nil {
386 | fmt.Fprint(os.Stderr, err)
387 | }
388 | }()
389 | }
390 | }
391 |
392 | // Bind all the control queues and start processing read and write commands.
393 | // This is done via multiple readers and writers. One worker per queue.
394 | func (b *Buse) Run() {
395 | b.ReadWriter.BusePreRun()
396 |
397 | var wg sync.WaitGroup
398 | wg.Add(int(b.Options.Threads) * 2)
399 | for i := 0; i < int(b.Options.Threads); i++ {
400 | w := fmt.Sprintf(buseWritePathFmt, b.Options.Major, i)
401 | r := fmt.Sprintf(buseReadPathFmt, b.Options.Major, i)
402 |
403 | go b.writer(w, &wg, int(b.Options.WriteShmSize))
404 | go b.reader(r, &wg, int(b.Options.ReadShmSize))
405 | }
406 | wg.Wait()
407 | }
408 |
409 | // Write value to configfs variable.
410 | func (b *Buse) setConfig(variable string, value int64) error {
411 | configFsPath := fmt.Sprint(configFsPath, "/", b.Options.Major, "/", variable)
412 | byteValue := []byte(fmt.Sprint(value))
413 |
414 | err := ioutil.WriteFile(configFsPath, byteValue, 0644)
415 |
416 | return err
417 | }
418 |
419 | // Stop buse device. All requests are refused but the device is still visible
420 | // and can be started again.
421 | func (b *Buse) StopDevice() error {
422 | err := b.setConfig("power", 0)
423 | return err
424 | }
425 |
426 | // Remove the device. The device is unregistered as block device.
427 | func (b *Buse) RemoveDevice() error {
428 | err := syscall.Rmdir(fmt.Sprint(configFsPath, "/", b.Options.Major))
429 | b.ReadWriter.BusePostRemove()
430 | return err
431 | }
432 |
--------------------------------------------------------------------------------
/lib/go/buse/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/asch/buse/lib/go/buse
2 |
3 | go 1.16
4 |
--------------------------------------------------------------------------------