├── src
│   ├── .gitignore
│   ├── dkms.conf
│   ├── Makefile
│   ├── dm-writeboost-daemon.h
│   ├── dm-writeboost-metadata.h
│   ├── dm-writeboost.h
│   ├── dm-writeboost-daemon.c
│   ├── dm-writeboost-metadata.c
│   └── dm-writeboost-target.c
├── Makefile
├── .github
│   └── workflows
│       └── ci.yml
├── README.md
├── ChangeLog
├── doc
│   └── dm-writeboost-readme.txt
└── LICENSE

/src/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore
3 | !dm-writeboost.h
4 | !dm-writeboost-target.[c]
5 | !dm-writeboost-metadata.[ch]
6 | !dm-writeboost-daemon.[ch]
7 | !dkms.conf
8 | !Makefile
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | MODULE_VERSION ?= 2.2.19
2 | DKMS_DIR := /usr/src/dm-writeboost-$(MODULE_VERSION)
3 | DKMS_KEY := -m dm-writeboost -v $(MODULE_VERSION)
4 | 
5 | install:
6 | 	cp -r src $(DKMS_DIR)
7 | 	dkms add $(DKMS_KEY)
8 | 	dkms build $(DKMS_KEY)
9 | 	dkms install $(DKMS_KEY)
10 | 
11 | uninstall:
12 | 	dkms remove --all $(DKMS_KEY)
13 | 	rm -rf $(DKMS_DIR)
14 | 
--------------------------------------------------------------------------------
/src/dkms.conf:
--------------------------------------------------------------------------------
1 | PACKAGE_NAME="dm-writeboost"
2 | PACKAGE_VERSION="2.2.19"
3 | 
4 | # dm-writeboost builds on top of dm features introduced in Linux 3.9
5 | BUILD_EXCLUSIVE_KERNEL_MIN="3.9"
6 | 
7 | BUILT_MODULE_NAME="dm-writeboost"
8 | DEST_MODULE_LOCATION="/kernel/drivers/md"
9 | MAKE="make all KERNEL_TREE=$kernel_source_dir"
10 | CLEAN="make clean"
11 | AUTOINSTALL="yes"
12 | 
--------------------------------------------------------------------------------
/src/Makefile:
--------------------------------------------------------------------------------
1 | KERNEL_SOURCE_VERSION ?= $(shell uname -r)
2 | KERNEL_TREE ?= /lib/modules/$(KERNEL_SOURCE_VERSION)/build
3 | 
4 | obj-m := dm-writeboost.o
5 | dm-writeboost-objs := \
6 | 	dm-writeboost-target.o \
7 | 	dm-writeboost-metadata.o \
8 | 	dm-writeboost-daemon.o
9 | 
10 | all:
11 | 	$(MAKE) -C $(KERNEL_TREE) M=$(PWD) modules
12 | 
13 | clean:
14 | 	$(MAKE) -C $(KERNEL_TREE) M=$(PWD) clean
15 | 
--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | name: CI
2 | 
3 | on:
4 |   pull_request:
5 |     branches:
6 |       - master
7 |   push:
8 |     branches:
9 |       - master
10 | 
11 | jobs:
12 |   run_tests:
13 |     name: Tests
14 |     runs-on: ubuntu-24.04
15 |     steps:
16 |       - name: Kernel version
17 |         run: uname -a
18 | 
19 |       - name: Install dependencies
20 |         uses: awalsh128/cache-apt-pkgs-action@latest
21 |         with:
22 |           packages: cargo dkms cryptsetup xfsprogs dbench stress
23 |           version: 1.0
24 | 
25 |       - name: Check cryptsetup enabled
26 |         run: cryptsetup benchmark -c aes-xts-plain64 -s 512
27 | 
28 |       - name: Checkout device-mapper-tests
29 |         uses: actions/checkout@v4
30 |         with:
31 |           repository: akiradeveloper/device-mapper-tests
32 |           ref: master
33 |           path: tests
34 | 
35 |       - name: Checkout dm-writeboost
36 |         uses: actions/checkout@v4
37 |         with:
38 |           repository: akiradeveloper/dm-writeboost
39 |           path: module
40 | 
41 |       - name: Install dm-writeboost target
42 |         working-directory: module
43 |         run: |
44 |           sudo mkdir -p /var/lib/dkms
45 |           sudo make install
46 | 
47 |       - name: Load dm-writeboost
48 |         run: sudo modprobe dm-writeboost
49 | 
50 |       - name: Checkout dm-writeboost-tools
51 |         uses:
actions/checkout@v4 52 | with: 53 | repository: akiradeveloper/dm-writeboost-tools 54 | ref: master 55 | path: tools 56 | 57 | - name: Install dm-writeboost-tools 58 | working-directory: tools 59 | run: sudo cargo install --path . --root /usr/local 60 | 61 | - name: Test (wb-command) 62 | working-directory: tests/wb-command-tests 63 | run: sudo make test 64 | 65 | - name: Test (writeboost) 66 | working-directory: tests/writeboost-tests 67 | run: sudo make test -------------------------------------------------------------------------------- /src/dm-writeboost-daemon.h: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of dm-writeboost 3 | * Copyright (C) 2012-2025 Akira Hayakawa 4 | * 5 | * This program is free software; you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation; either version 2 of the License, or 8 | * (at your option) any later version. 9 | * 10 | * This program is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | * GNU General Public License for more details. 14 | * 15 | * You should have received a copy of the GNU General Public License along 16 | * with this program; if not, write to the Free Software Foundation, Inc., 17 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 18 | */ 19 | 20 | #ifndef DM_WRITEBOOST_DAEMON_H 21 | #define DM_WRITEBOOST_DAEMON_H 22 | 23 | /*----------------------------------------------------------------------------*/ 24 | 25 | int flush_daemon_proc(void *); 26 | void wait_for_flushing(struct wb_device *, u64 id); 27 | 28 | /*----------------------------------------------------------------------------*/ 29 | 30 | void queue_barrier_io(struct wb_device *, struct bio *); 31 | void flush_barrier_ios(struct work_struct *); 32 | 33 | /*----------------------------------------------------------------------------*/ 34 | 35 | void update_nr_empty_segs(struct wb_device *); 36 | int writeback_daemon_proc(void *); 37 | void wait_for_writeback(struct wb_device *, u64 id); 38 | void mark_clean_seg(struct wb_device *, struct segment_header *seg); 39 | 40 | /*----------------------------------------------------------------------------*/ 41 | 42 | int writeback_modulator_proc(void *); 43 | 44 | /*----------------------------------------------------------------------------*/ 45 | 46 | int data_synchronizer_proc(void *); 47 | 48 | /*----------------------------------------------------------------------------*/ 49 | 50 | int sb_record_updater_proc(void *); 51 | 52 | /*----------------------------------------------------------------------------*/ 53 | 54 | #endif 55 | -------------------------------------------------------------------------------- /src/dm-writeboost-metadata.h: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of dm-writeboost 3 | * Copyright (C) 2012-2025 Akira Hayakawa 4 | * 5 | * This program is free software; you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation; either version 2 of the License, or 8 | * (at your option) any later version. 
9 | *
10 | * This program is distributed in the hope that it will be useful,
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | * GNU General Public License for more details.
14 | *
15 | * You should have received a copy of the GNU General Public License along
16 | * with this program; if not, write to the Free Software Foundation, Inc.,
17 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
18 | */
19 | 
20 | #ifndef DM_WRITEBOOST_METADATA_H
21 | #define DM_WRITEBOOST_METADATA_H
22 | 
23 | /*----------------------------------------------------------------------------*/
24 | 
25 | struct segment_header *
26 | get_segment_header_by_id(struct wb_device *, u64 segment_id);
27 | struct rambuffer *get_rambuffer_by_id(struct wb_device *wb, u64 id);
28 | sector_t calc_mb_start_sector(struct wb_device *, struct segment_header *,
29 | 			      u32 mb_idx);
30 | u8 mb_idx_inseg(struct wb_device *, u32 mb_idx);
31 | struct segment_header *mb_to_seg(struct wb_device *, struct metablock *);
32 | bool is_on_buffer(struct wb_device *, u32 mb_idx);
33 | 
34 | /*----------------------------------------------------------------------------*/
35 | 
36 | struct lookup_key {
37 | 	sector_t sector;
38 | };
39 | 
40 | struct ht_head;
41 | struct ht_head *ht_get_head(struct wb_device *, struct lookup_key *);
42 | struct metablock *ht_lookup(struct wb_device *,
43 | 			    struct ht_head *, struct lookup_key *);
44 | void ht_register(struct wb_device *, struct ht_head *,
45 | 		 struct metablock *, struct lookup_key *);
46 | void ht_del(struct wb_device *, struct metablock *);
47 | void discard_caches_inseg(struct wb_device *, struct segment_header *);
48 | 
49 | /*----------------------------------------------------------------------------*/
50 | 
51 | void prepare_segment_header_device(void *rambuffer, struct wb_device *,
52 | 				   struct segment_header *src);
53 | u32 calc_checksum(void *rambuffer, u8 length);
54 | 
55 | /*----------------------------------------------------------------------------*/
56 | 
57 | int try_alloc_writeback_ios(struct wb_device *, size_t nr_batch, gfp_t gfp);
58 | 
59 | /*----------------------------------------------------------------------------*/
60 | 
61 | int resume_cache(struct wb_device *);
62 | void free_cache(struct wb_device *);
63 | 
64 | /*----------------------------------------------------------------------------*/
65 | 
66 | #endif
67 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # dm-writeboost
2 | 
3 | ![CI](https://github.com/akiradeveloper/dm-writeboost/actions/workflows/ci.yml/badge.svg)
4 | 
5 | Log-structured Caching for Linux
6 | 
7 | ## Overview
8 | dm-writeboost originated from [Disk Caching Disk (DCD)](http://www.ele.uri.edu/research/hpcl/DCD/DCD.html).
9 | DCD, implemented in Solaris, is an OS-level IO controller that builds logs from incoming writes
10 | (data and metadata) and then writes the logs sequentially, similar to a log-structured filesystem.
11 | dm-writeboost implements this concept on Linux's device-mapper in a more sophisticated way.
12 | As a further extension, dm-writeboost supports read-caching, which also writes data sequentially.
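For a quick taste, a dm-writeboost'd device is built roughly like this. This is an illustrative sketch only: the device paths are placeholders, and the full table format and tunables are described in doc/dm-writeboost-readme.txt.

```sh
# Placeholders: /dev/sdb is the backing HDD, /dev/sdc is the caching SSD.
BACKING=/dev/sdb
CACHE=/dev/sdc

# Zero the first sector of the caching device ONLY on the very first setup;
# this triggers reformatting. Skip this step when resuming an existing cache.
dd if=/dev/zero of=$CACHE oflag=direct bs=512 count=1

# Create the dm-writeboost'd device (here with one optional key-value pair).
sz=$(blockdev --getsz $BACKING)
dmsetup create wbdev --table "0 $sz writeboost $BACKING $CACHE 2 writeback_threshold 70"
```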
13 | 
14 | ## Documents
15 | - [dm-writeboost-quickstart](https://docs.google.com/presentation/d/1v-L8Ma138o7jNBFqRl0epyc1Lji3XhUH1RGj8p7DVe8/edit?usp=sharing)
16 | - doc/dm-writeboost-readme.txt
17 | - [dm-writeboost-internal](https://docs.google.com/presentation/d/1mDh5ct3OR-eRxBbci3LQgaTvUFx9WTLw-kkBxNBeTD8/edit?usp=sharing)
18 | - [Wiki](https://github.com/akiradeveloper/dm-writeboost/wiki)
19 | 
20 | ## Features
21 | * **Durable**: A power failure can't break consistency because each log consists of data, metadata and
22 | the checksum of the log itself.
23 | * **Lifetime**: Other caching software (e.g. dm-cache) separates data and
24 | metadata and therefore submits writes to the SSD too frequently. dm-writeboost,
25 | on the other hand, submits only one write for hundreds of data and
26 | metadata updates, so the SSD lives longer since an SSD's lifetime depends on
27 | how many writes are submitted to it.
28 | * **Fast**: Since sequential writes are the best I/O pattern for every SSD and the code base is optimized for
29 | incoming random writes, the write performance is the best among all caching drivers, including dm-cache and
30 | bcache.
31 | * **Portable**: All kernel versions 3.10 and later are supported with a minimum of compile-time macros.
32 | 
33 | ## Usage
34 | - **Install**: `sudo make install` to install and `sudo make uninstall` to uninstall.
35 | `sudo make uninstall MODULE_VERSION=xxx` uninstalls a specific installed version.
36 | DKMS is required, so please install it beforehand (it is usually available in your distribution's package system).
37 | - **Make a device**: Write a script that builds the caching device. Please read doc/dm-writeboost-readme.txt for
38 | the details of the dmsetup command.
39 | After a reboot, you need to rebuild the caching device rather than reformat it as in the initial setup.
40 | 
41 | ## Distribution Packages
42 | - [Debian](https://packages.debian.org/search?keywords=dm-writeboost-dkms)
43 | - [Ubuntu](https://packages.ubuntu.com/search?keywords=dm-writeboost-dkms)
44 | 
45 | ## Related Projects
46 | * https://github.com/akiradeveloper/dm-writeboost-tools: Tools to help users analyze the state of the caching device
47 | * https://gitlab.com/onlyjob/writeboost: A management tool including an init script
48 | * https://github.com/akiradeveloper/device-mapper-tests: Testing framework written in Rust
49 | 
50 | ## Related works
51 | * Y. Hu and Q. Yang -- DCD Disk Caching Disk: A New Approach for Boosting I/O Performance (1995)
52 | (http://www.ele.uri.edu/research/hpcl/DCD/DCD.html)
53 | * G. Soundararajan et al. -- Extending SSD Lifetimes with Disk-Based Write Caches (2010)
54 | (https://www.usenix.org/conference/fast-10/extending-ssd-lifetimes-disk-based-write-caches)
55 | * Y. Oh -- SSD RAID as Cache (SRC) with Log-structured Approach for Performance and Reliability (2014)
56 | (https://ysoh.files.wordpress.com/2009/05/dm-src-ibm.pdf)
57 | 
58 | ## Award
59 | Received the Japanese OSS Encouragement Award. Thanks!
60 | 
61 | ## License
62 | ```
63 | Copyright (C) 2012-2025 Akira Hayakawa
64 | 
65 | This program is free software; you can redistribute it and/or modify
66 | it under the terms of the GNU General Public License as published by
67 | the Free Software Foundation; either version 2 of the License, or
68 | (at your option) any later version.
69 | 
70 | This program is distributed in the hope that it will be useful,
71 | but WITHOUT ANY WARRANTY; without even the implied warranty of
72 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the 73 | GNU General Public License for more details. 74 | 75 | You should have received a copy of the GNU General Public License along 76 | with this program; if not, write to the Free Software Foundation, Inc., 77 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 78 | ``` 79 | 80 | ## Developer Info 81 | Akira Hayakawa (@akiradeveloper) 82 | e-mail: ruby.wktk@gmail.com 83 | -------------------------------------------------------------------------------- /ChangeLog: -------------------------------------------------------------------------------- 1 | 2025-07-10 Akira Hayakawa 2 | 3 | * v2.2.19 4 | * Fix build error with 6.12 kernel 5 | * Refactor 6 | * Update copyright year 7 | 8 | 2024-11-12 Akira Hayakawa 9 | 10 | * v2.2.18 11 | * Fix build error with 6.10 kernel 12 | 13 | 2024-05-01 Akira Hayakawa 14 | 15 | * v2.2.17 16 | * Fix build error with 6.9 kernel and backports 17 | * Improve dkms.conf 18 | 19 | 2023-02-11 Akira Hayakawa 20 | 21 | * v2.2.16 22 | * Fix build error with 6.0 kernel 23 | * Handle STATUSTYPE_IMA 24 | 25 | 2021-05-04 Akira Hayakawa 26 | 27 | * v2.2.15 28 | * Fix build error with 5.12 kernel 29 | 30 | 2021-03-21 Akira Hayakawa 31 | 32 | * v2.2.14 33 | * Fix build error with 5.11 kernel 34 | * Update copyright year 35 | 36 | 2020-11-22 Akira Hayakawa 37 | 38 | * v2.2.13 39 | * Fix build error with 5.9 kernel 40 | 41 | 2020-08-09 Akira Hayakawa 42 | 43 | * v2.2.12 44 | * Fix build error with 5.8 kernel 45 | 46 | 2020-06-05 Akira Hayakawa 47 | 48 | * v2.2.11 49 | * Fix build error with 5.7 kernel 50 | * Update copyright year 51 | 52 | 2018-11-08 Akira Hayakawa 53 | 54 | * v2.2.10 55 | * Fix build error with 4.19 kernel 56 | 57 | 2018-06-09 Akira Hayakawa 58 | 59 | * v2.2.9 60 | * Fix build error with 4.15 kernel 61 | * Fix build error with 4.17 kernel 62 | 63 | 2017-10-15 Akira Hayakawa 64 | 65 | * v2.2.8 66 | * Fix build error with 4.14 kernel 67 | * Support 4Kn devices 68 | 69 | 2017-04-13 Akira Hayakawa 70 | 71 | * v2.2.7 72 | * Fix build error with CentOS 7.3 73 | * Wake up writeback thread only when needed 74 | * Fix doc (deprecated --getsize option) 75 | 76 | 2016-09-19 Akira Hayakawa 77 | 78 | * v2.2.6 79 | * Clarify producer-consumer pattern 80 | * Fix build error with 3.10 kernel 81 | * Fix build error with 3.14 kernel 82 | 83 | 2016-09-12 Akira Hayakawa 84 | 85 | * v2.2.5 86 | * Fix read-caching data corruption issue 87 | * Insert memory barriers 88 | * Code cleanup 89 | 90 | 2016-08-28 Akira Hayakawa 91 | 92 | * v2.2.4 93 | * Fix update_sb_record_interval 94 | * Throttle writeback when there are only few empty segments in the 95 | caching device 96 | * Remove experimental from read-caching 97 | 98 | 2016-08-02 Akira Hayakawa 99 | 100 | * v2.2.3 101 | * Rename write_through_mode to write_around_mode because it's more 102 | precise 103 | * Reformat the caching device when it's write_around_mode 104 | 105 | 2016-07-30 Akira Hayakawa 106 | 107 | * v2.2.2 108 | * Use kmap_atomic() to access the bio payload 109 | * Fix doc (clear_stat) 110 | 111 | 2016-07-18 Akira Hayakawa 112 | 113 | * v2.2.1 114 | * Unsupport TRIM 115 | * Fixes (fail if partial read from caching device fails etc.) 116 | 117 | 2016-05-01 Akira Hayakawa 118 | 119 | * v2.2.0 120 | * Remove partial writeback in foreground. This results in writing 121 | back cached data strictly from the older ones, which makes cache 122 | device corruption safer 123 | * Fix build error for kernel 4.6. 
per_bio_data_size is renamed to 124 | per_io_data_size 125 | * Remove SECTOR_SHIFT 126 | 127 | 2016-03-05 Akira Hayakawa 128 | 129 | * v2.1.2 130 | * Remove blockup mechanism 131 | * Use vmalloc for read_cache_cell's buffer 132 | 133 | 2016-01-04 Akira Hayakawa 134 | 135 | * v2.1.1 136 | * Define bio_endio_compat 137 | * Update copyright date 138 | * Update/fix docs 139 | 140 | 2015-08-02 Akira Hayakawa 141 | 142 | * v2.1.0 143 | * Remove ACCESS_ONCE around cell->cancelled 144 | * Change the type of cell->cancelled from int to bool 145 | * Fix dmsetup table 146 | * Add write_through_mode 147 | 148 | 2015-07-28 Akira Hayakawa 149 | 150 | * v2.0.6 151 | * Use vmalloc for rambuf and writeback_segs 152 | * Fix location of might_queue_current_buffer() (this is a good 153 | refactoring too) 154 | * Fix inject_read_cache so it checks cell->cancelled inside mutex. 155 | * Fix comment (ctr) 156 | 157 | 2015-07-20 Akira Hayakawa 158 | 159 | * v2.0.5 160 | * Add __GFP_NOWARN to allocation of writeback ios 161 | * Use vmalloc for large_array struct 162 | 163 | 2015-07-15 Akira Hayakawa 164 | 165 | * v2.0.4 166 | * Fast-path for clean initialization 167 | * Restrict the nr_max_batched_writeback 168 | 169 | 2015-07-13 Akira Hayakawa 170 | 171 | * v2.0.3 172 | * Use separate wq for barrier flush 173 | 174 | 2015-07-12 Akira Hayakawa 175 | 176 | * v2.0.2 177 | * Fix the crc32c wrapper so it complements the computed value. 178 | 179 | 2015-07-09 Akira Hayakawa 180 | 181 | * v2.0.1 182 | * Fix for "mkfs.xfs -m crc=1" issue. 183 | Add copy_bio_payload(). 184 | * Fix end_io not to ignore error. 185 | * Fix bad pointer access in try_alloc_writeback_ios(). 186 | 187 | 2015-06-16 Akira Hayakawa 188 | 189 | * v2.0.0 190 | * Design change. 191 | Purge static optional args (nr_rambuf_pool, segment_size_order) 192 | so as to work well with Dmitry's tool. 193 | 194 | 2015-05-14 Akira Hayakawa 195 | 196 | * v1.0.1 197 | * Fix read-caching that didn't hit at all. 198 | 199 | 2015-05-10 Akira Hayakawa 200 | 201 | * v1.0.0 202 | -------------------------------------------------------------------------------- /doc/dm-writeboost-readme.txt: -------------------------------------------------------------------------------- 1 | dm-writeboost 2 | ============= 3 | dm-writeboost target provides block-level log-structured caching. 4 | All writes and reads are written to the caching device in sequential manner. 5 | 6 | 7 | Mechanism 8 | ========= 9 | Control three layers (RAM buffer, caching device and backing device) 10 | -------------------------------------------------------------------- 11 | dm-writeboost controls three different layers - RAM buffer (rambuf), caching 12 | device (cache_dev, e.g SSD) and backing device (backing_dev, e.g. HDD). 13 | All data are first stored in the RAM buffer and when the RAM buffer is full, 14 | dm-writeboost adds metadata block (with checksum) on the RAM buffer to create a 15 | "log". Afterward, the log is written to the caching device sequentially by a 16 | background thread and thereafter written back to the backing device in the 17 | background as well. 18 | 19 | 20 | dm-writeboost vs dm-cache or bcache 21 | =================================== 22 | How dm-writeboost differs from other existing SSD-caching drivers? 23 | 24 | The most distinctive point is that dm-writeboost writes to caching device the 25 | least frequently. 
25 | Because it creates a log that contains 127 writes before
26 | it actually writes the log to the caching device, writing to the caching device
27 | happens only once per 127 writes, while other caching drivers write more often.
28 | Since an SSD's lifetime decreases with the number of writes it receives, users can
29 | reduce the risk of wearing out the SSD.
30 | 
31 | dm-writeboost performs much more efficiently than other caching solutions under
32 | small random I/O patterns. But since it always splits requests into 4KB chunks,
33 | it may not be the best choice when the average I/O size of your workload is very large.
34 | However, splitting overhead aside, dm-writeboost is always the best of
35 | all because it caches data in a sequential manner - the most efficient I/O pattern
36 | for the SSD caching device in terms of performance.
37 | 
38 | It's known from experiments that dm-writeboost doesn't perform well when you create
39 | a dm-writeboost'd device in a virtual environment such as KVM. So keep in mind to
40 | use this driver on a physical machine.
41 | 
42 | 
43 | How To Use dm-writeboost
44 | ========================
45 | Trigger caching device reformat
46 | -------------------------------
47 | Reformatting of the caching device is triggered only if the first sector of the
48 | caching device is zeroed out. Note that this operation should be omitted when
49 | you resume the caching device.
50 | e.g. dd if=/dev/zero of=$CACHE oflag=direct bs=512 count=1
51 | 
52 | Construct dm-writeboost'd device
53 | --------------------------------
54 | You can construct a dm-writeboost'd device with the dmsetup create command.
55 | 
56 | 
57 | <essential args> <#optional args> <optional args>
58 | 
59 | - <#optional args> is twice the length of the following list.
60 | - <optional args> is an unordered list of key-value pairs.
61 | 
62 | <essential args>
63 | backing_dev : A block device having original data (e.g. HDD)
64 | cache_dev : A block device having caches (e.g. SSD)
65 | 
66 | <optional args>
67 | see `Optional args`
68 | 
69 | e.g.
70 | BACKING=/dev/sdb # example
71 | CACHE=/dev/sdc # example
72 | sz=`blockdev --getsz ${BACKING}`
73 | dmsetup create wbdev --table "0 $sz writeboost $BACKING $CACHE 2 writeback_threshold 70"
74 | 
75 | Shut down the system
76 | --------------------
77 | On shutting down the system, you don't need to do anything at all. The data
78 | and metadata are safely saved on the caching device. But if you want to
79 | deconstruct the device manually, use dmsetup remove.
80 | 
81 | Resume after system reboot
82 | --------------------------
83 | To resume your caching device from its on-disk state, run the dmsetup create command
84 | with the same parameters but DO NOT zero out the first sector of the caching device.
85 | This replays the logs on the caching device to rebuild the internal data structures.
86 | 
87 | Remove caching device
88 | ---------------------
89 | If you want to detach your caching device for some reason (you don't like
90 | dm-writeboost anymore or you want to upgrade the caching device to a newly
91 | purchased one), the safest way to do this is to clean the dirty data up from your
92 | caching device first and then deconstruct the dm-writeboost'd device.
93 | You can do this by first suspending/resuming the device to drop all transient data
94 | from the RAM buffer and then sending the drop_caches message to drop dirty cache blocks
95 | from the caching device.
96 | e.g.
97 | dmsetup suspend wbdev; dmsetup resume wbdev
98 | dmsetup message wbdev 0 drop_caches
99 | dmsetup remove wbdev
100 | 
101 | Optional args
102 | -------------
103 | writeback_threshold (%)
104 |   accepts: 0..100
105 |   default: 0 (writeback disabled)
106 | Writeback is suppressed when the load of the backing device is higher than
107 | $writeback_threshold.
108 | 
109 | nr_max_batched_writeback
110 |   accepts: 1..32
111 |   default: 32
112 | As an optimization, dm-writeboost writes back $nr_max_batched_writeback segments
113 | simultaneously. The dirty caches in the segments are sorted in ascending order
114 | of the destination address and then written back. Setting a large value can boost
115 | the writeback performance.
116 | 
117 | update_sb_record_interval (sec)
118 |   accepts: 0..3600
119 |   default: 0 (disabled)
120 | Update the superblock every $update_sb_record_interval seconds. 0 means disabled.
121 | The superblock records the ID of the last segment that was written back.
122 | By enabling this, dm-writeboost can skip segments that have already been
123 | written back when resuming and thus shorten the resume time.
124 | 
125 | sync_data_interval (sec)
126 |   accepts: 0..3600
127 |   default: 0 (disabled)
128 | Sync all the volatile data every $sync_data_interval seconds. 0 means disabled.
129 | 
130 | read_cache_threshold (int)
131 |   accepts: 0..127
132 |   default: 0 (read caching disabled)
133 | Consecutive reads larger than $read_cache_threshold * 4KB won't be staged.
134 | 
135 | write_around_mode (bool)
136 |   accepts: 0..1
137 |   default: 0
138 | By enabling this, dm-writeboost writes data directly to the backing device.
139 | 
140 | Messages
141 | --------
142 | You can change the behavior of a dm-writeboost'd device via messages.
143 | 
144 | (1) Optional args
145 | The following optional args can be tuned online.
146 | e.g. dmsetup message wbdev 0 writeback_threshold 70
147 | 
148 | - writeback_threshold
149 | - nr_max_batched_writeback
150 | - update_sb_record_interval
151 | - sync_data_interval
152 | - read_cache_threshold
153 | 
154 | (2) Others
155 | drop_caches
156 | Wait for all dirty data on the caching device to be written back to the backing
157 | device. This is interruptible.
158 | clear_stat
159 | Clear the statistics (see `Status`).
160 | 
161 | Status
162 | ------
163 | <cursor_pos>
164 | <nr_cache_blocks>
165 | <nr_segments>
166 | <current_segment_id>
167 | <last_flushed_segment_id>
168 | <last_writeback_segment_id>
169 | <nr_dirty_cache_blocks>
170 | <stat (write?) x (hit?) x (on_buffer?) x (fullsize?)>
171 | <nr_non_full_flushed>
172 | <#optional args>
173 | <optional args>
--------------------------------------------------------------------------------
/src/dm-writeboost.h:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of dm-writeboost
3 | * Copyright (C) 2012-2025 Akira Hayakawa
4 | *
5 | * This program is free software; you can redistribute it and/or modify
6 | * it under the terms of the GNU General Public License as published by
7 | * the Free Software Foundation; either version 2 of the License, or
8 | * (at your option) any later version.
9 | *
10 | * This program is distributed in the hope that it will be useful,
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | * GNU General Public License for more details.
14 | *
15 | * You should have received a copy of the GNU General Public License along
16 | * with this program; if not, write to the Free Software Foundation, Inc.,
17 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
18 | */ 19 | 20 | #ifndef DM_WRITEBOOST_H 21 | #define DM_WRITEBOOST_H 22 | 23 | #define DM_MSG_PREFIX "writeboost" 24 | 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | 40 | /* We use RHEL_RELEASE_VERSION to compile with RHEL/CentOS 7.3's kernel */ 41 | #ifndef RHEL_RELEASE_CODE 42 | #define RHEL_RELEASE_CODE 0 43 | #define RHEL_RELEASE_VERSION(a,b) (((a) << 8) + (b)) 44 | #endif 45 | 46 | /*----------------------------------------------------------------------------*/ 47 | 48 | #define SUB_ID(x, y) ((x) > (y) ? (x) - (y) : 0) 49 | 50 | /*----------------------------------------------------------------------------*/ 51 | 52 | /* 53 | * The detail of the disk format (SSD) 54 | * ----------------------------------- 55 | * 56 | * ### Overall 57 | * Superblock (1MB) + Segment + Segment ... 58 | * 59 | * ### Superblock 60 | * Head <---- ----> Tail 61 | * Superblock Header (512B) + ... + Superblock Record (512B) 62 | * 63 | * ### Segment 64 | * segment_header_device (512B) + 65 | * metablock_device * nr_caches_inseg + 66 | * data[0] (4KB) + data[1] + ... + data[nr_cache_inseg - 1] 67 | */ 68 | 69 | /*----------------------------------------------------------------------------*/ 70 | 71 | /* 72 | * Superblock Header (Immutable) 73 | * ----------------------------- 74 | * First one sector of the super block region whose value is unchanged after 75 | * formatted. 76 | */ 77 | #define WB_MAGIC 0x57427374 /* Magic number "WBst" */ 78 | struct superblock_header_device { 79 | __le32 magic; 80 | } __packed; 81 | 82 | /* 83 | * Superblock Record (Mutable) 84 | * --------------------------- 85 | * Last one sector of the superblock region. Record the current cache status if 86 | * required. 87 | */ 88 | struct superblock_record_device { 89 | __le64 last_writeback_segment_id; 90 | } __packed; 91 | 92 | /*----------------------------------------------------------------------------*/ 93 | 94 | /* 95 | * The size must be a factor of one sector to avoid starddling neighboring two 96 | * sectors. 97 | */ 98 | struct metablock_device { 99 | __le64 sector; 100 | __u8 dirty_bits; 101 | __u8 padding[16 - (8 + 1)]; /* 16B */ 102 | } __packed; 103 | 104 | struct segment_header_device { 105 | /* 106 | * We assume 1 sector write is atomic. 107 | * This 1 sector region contains important information such as checksum 108 | * of the rest of the segment data. We use 32bit checksum to audit if 109 | * the segment is correctly written to the cache device. 110 | */ 111 | /* - FROM ------------------------------------ */ 112 | __le64 id; 113 | __le32 checksum; 114 | /* 115 | * The number of metablocks in this segment header to be considered in 116 | * log replay. 117 | */ 118 | __u8 length; 119 | __u8 padding[512 - (8 + 4 + 1)]; /* 512B */ 120 | /* - TO -------------------------------------- */ 121 | struct metablock_device mbarr[0]; /* 16B * N */ 122 | } __packed; 123 | 124 | /*----------------------------------------------------------------------------*/ 125 | 126 | struct dirtiness { 127 | bool is_dirty; 128 | u8 data_bits; 129 | }; 130 | 131 | struct metablock { 132 | sector_t sector; /* The original aligned address */ 133 | 134 | u32 idx; /* Const. 
Index in the metablock array */ 135 | 136 | struct hlist_node ht_list; /* Linked to the hash table */ 137 | 138 | struct dirtiness dirtiness; 139 | }; 140 | 141 | #define SZ_MAX (~(size_t)0) 142 | struct segment_header { 143 | u64 id; /* Must be initialized to 0 */ 144 | 145 | u8 length; /* The number of valid metablocks */ 146 | 147 | u32 start_idx; /* Const */ 148 | sector_t start_sector; /* Const */ 149 | 150 | atomic_t nr_inflight_ios; 151 | 152 | struct metablock mb_array[0]; 153 | }; 154 | 155 | /*----------------------------------------------------------------------------*/ 156 | 157 | /* 158 | * RAM buffer is a buffer that any dirty data are first written into. 159 | */ 160 | struct rambuffer { 161 | struct segment_header *seg; 162 | void *data; 163 | struct bio_list barrier_ios; /* List of deferred bios */ 164 | }; 165 | 166 | /*----------------------------------------------------------------------------*/ 167 | 168 | /* 169 | * Batched and Sorted Writeback 170 | * ---------------------------- 171 | * 172 | * Writeback daemon writes back segments on the cache device effectively. 173 | * "Batched" means it writes back number of segments at the same time in 174 | * asynchronous manner. 175 | * "Sorted" means these writeback IOs are sorted in ascending order of LBA in 176 | * the backing device. Rb-tree is used to sort the writeback IOs. 177 | * 178 | * Reading from the cache device is sequential. 179 | */ 180 | 181 | /* 182 | * Writeback of a cache line (or metablock) 183 | */ 184 | struct writeback_io { 185 | struct rb_node rb_node; 186 | 187 | sector_t sector; /* Key */ 188 | u64 id; /* Key */ 189 | 190 | void *data; 191 | u8 data_bits; 192 | }; 193 | #define writeback_io_from_node(node) \ 194 | rb_entry((node), struct writeback_io, rb_node) 195 | 196 | /* 197 | * Writeback of a segment 198 | */ 199 | struct writeback_segment { 200 | struct segment_header *seg; /* Segment to write back */ 201 | struct writeback_io *ios; 202 | void *buf; /* Sequentially read */ 203 | }; 204 | 205 | /*----------------------------------------------------------------------------*/ 206 | 207 | struct read_cache_cell { 208 | sector_t sector; 209 | void *data; /* 4KB data read */ 210 | atomic_t cancelled; /* Don't include this */ 211 | struct rb_node rb_node; 212 | }; 213 | 214 | struct read_cache_cells { 215 | struct mutex lock; 216 | 217 | u32 size; 218 | struct read_cache_cell *array; 219 | u32 cursor; 220 | atomic_t ack_count; 221 | sector_t last_sector; /* The last read sector in foreground */ 222 | u32 seqcount; 223 | u32 threshold; 224 | bool over_threshold; 225 | /* 226 | * We use RB-tree for lookup data structure that all elements are 227 | * sorted. Cells are sorted by the sector so we can easily detect 228 | * sequence. 229 | */ 230 | struct rb_root rb_root; 231 | struct workqueue_struct *wq; 232 | }; 233 | 234 | /*----------------------------------------------------------------------------*/ 235 | 236 | enum STATFLAG { 237 | WB_STAT_WRITE = 3, /* Write or read */ 238 | WB_STAT_HIT = 2, /* Hit or miss */ 239 | WB_STAT_ON_BUFFER = 1, /* Found on buffer or on the cache device */ 240 | WB_STAT_FULLSIZE = 0, /* Bio is fullsize or partial */ 241 | }; 242 | #define STATLEN (1 << 4) 243 | 244 | enum WB_FLAG { 245 | WB_CREATED = 0, 246 | }; 247 | 248 | #define SEGMENT_SIZE_ORDER 10 249 | #define NR_RAMBUF_POOL 8 250 | 251 | /* 252 | * The context of the cache target instance. 
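 * It aggregates everything one target instance needs: the backing and
 * caching devices, the current write position (cursor, current_seg,
 * current_rambuf), the segment header array and the chained hash table
 * used for lookup, the RAM buffer pool, the background daemons (flush,
 * writeback, writeback modulator, superblock record updater, data
 * synchronizer), their tunables, and the statistics counters.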
253 | */ 254 | struct wb_device { 255 | struct dm_target *ti; 256 | 257 | struct dm_dev *backing_dev; /* Slow device (HDD) */ 258 | struct dm_dev *cache_dev; /* Fast device (SSD) */ 259 | 260 | bool write_around_mode; 261 | 262 | unsigned nr_ctr_args; 263 | const char **ctr_args; 264 | 265 | bool do_format; /* True if it was the first creation */ 266 | struct mutex io_lock; /* Mutex is light-weighed */ 267 | 268 | /* 269 | * Wq to wait for nr_inflight_ios to be zero. 270 | * nr_inflight_ios of segment header increments inside io_lock. 271 | * While the refcount > 0, the segment can not be overwritten since 272 | * there is at least one bio to direct it. 273 | */ 274 | wait_queue_head_t inflight_ios_wq; 275 | 276 | spinlock_t mb_lock; 277 | 278 | u8 nr_caches_inseg; /* Const */ 279 | 280 | struct kmem_cache *buf_8_cachep; 281 | mempool_t *buf_8_pool; /* 8 sector buffer pool */ 282 | struct workqueue_struct *io_wq; 283 | struct dm_io_client *io_client; 284 | 285 | /*--------------------------------------------------------------------*/ 286 | 287 | /****************** 288 | * Current position 289 | ******************/ 290 | 291 | u32 cursor; /* Metablock index to write next */ 292 | struct segment_header *current_seg; 293 | struct rambuffer *current_rambuf; 294 | 295 | /*--------------------------------------------------------------------*/ 296 | 297 | /********************** 298 | * Segment header array 299 | **********************/ 300 | 301 | u32 nr_segments; /* Const */ 302 | struct large_array *segment_header_array; 303 | 304 | /*--------------------------------------------------------------------*/ 305 | 306 | /******************** 307 | * Chained Hash table 308 | ********************/ 309 | 310 | u32 nr_caches; /* Const */ 311 | struct large_array *htable; 312 | size_t htsize; /* Number of buckets in the hash table */ 313 | 314 | /* 315 | * Our hashtable has one special bucket called null head. 316 | * Orphan metablocks are linked to the null head. 317 | */ 318 | struct ht_head *null_head; 319 | 320 | /*--------------------------------------------------------------------*/ 321 | 322 | /***************** 323 | * RAM buffer pool 324 | *****************/ 325 | 326 | struct rambuffer *rambuf_pool; 327 | 328 | atomic64_t last_queued_segment_id; 329 | 330 | /*--------------------------------------------------------------------*/ 331 | 332 | /******************** 333 | * One-shot Writeback 334 | ********************/ 335 | 336 | struct dm_kcopyd_client *copier; 337 | 338 | /*--------------------------------------------------------------------*/ 339 | 340 | /************** 341 | * Flush Daemon 342 | **************/ 343 | 344 | struct task_struct *flush_daemon; 345 | 346 | /* 347 | * Wait for a specified segment to be flushed. Non-interruptible 348 | * cf. wait_for_flushing() 349 | */ 350 | wait_queue_head_t flush_wait_queue; 351 | 352 | atomic64_t last_flushed_segment_id; 353 | 354 | /*--------------------------------------------------------------------*/ 355 | 356 | /************************* 357 | * Barrier deadline worker 358 | *************************/ 359 | 360 | /* 361 | * We shouldn't use kernel-global workqueue for this worker 362 | * because it may cause timeout for the flush requests. 
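 * Barrier bios are queued onto barrier_ios by queue_barrier_io(), which
 * also kicks flush_barrier_work on this dedicated workqueue
 * (see flush_barrier_ios() in dm-writeboost-daemon.c).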
363 | */ 364 | struct workqueue_struct *barrier_wq; 365 | struct work_struct flush_barrier_work; 366 | struct bio_list barrier_ios; /* List of barrier requests */ 367 | 368 | /*--------------------------------------------------------------------*/ 369 | 370 | /****************** 371 | * Writeback Daemon 372 | ******************/ 373 | 374 | struct task_struct *writeback_daemon; 375 | int allow_writeback; 376 | int urge_writeback; /* Start writeback immediately */ 377 | int force_drop; /* Don't stop writeback */ 378 | atomic64_t last_writeback_segment_id; 379 | 380 | /* 381 | * Wait for a specified segment to be written back. Non-interruptible 382 | * cf. wait_for_writeback() 383 | */ 384 | wait_queue_head_t writeback_wait_queue; 385 | 386 | /* 387 | * Wait for writing back all the dirty caches. Interruptible 388 | */ 389 | wait_queue_head_t wait_drop_caches; 390 | atomic64_t nr_dirty_caches; 391 | 392 | /* 393 | * Wait for a background writeback complete 394 | */ 395 | wait_queue_head_t writeback_io_wait_queue; 396 | atomic_t writeback_io_count; 397 | atomic_t writeback_fail_count; 398 | 399 | u32 nr_max_batched_writeback; /* Tunable */ 400 | u32 nr_max_batched_writeback_saved; 401 | 402 | struct rb_root writeback_tree; 403 | 404 | u32 nr_writeback_segs; 405 | struct writeback_segment **writeback_segs; 406 | u32 nr_cur_batched_writeback; /* Number of segments to be written back */ 407 | u32 nr_empty_segs; 408 | 409 | /*--------------------------------------------------------------------*/ 410 | 411 | /********************* 412 | * Writeback Modulator 413 | *********************/ 414 | 415 | struct task_struct *writeback_modulator; 416 | u8 writeback_threshold; /* Tunable */ 417 | u8 writeback_threshold_saved; 418 | 419 | /*--------------------------------------------------------------------*/ 420 | 421 | /*************************** 422 | * Superblock Record Updater 423 | ***************************/ 424 | 425 | struct task_struct *sb_record_updater; 426 | unsigned long update_sb_record_interval; /* Tunable */ 427 | unsigned long update_sb_record_interval_saved; 428 | 429 | /*--------------------------------------------------------------------*/ 430 | 431 | /******************* 432 | * Data Synchronizer 433 | *******************/ 434 | 435 | struct task_struct *data_synchronizer; 436 | unsigned long sync_data_interval; /* Tunable */ 437 | unsigned long sync_data_interval_saved; 438 | 439 | /*--------------------------------------------------------------------*/ 440 | 441 | /************** 442 | * Read Caching 443 | **************/ 444 | 445 | u32 nr_read_cache_cells; 446 | u32 nr_read_cache_cells_saved; 447 | struct work_struct read_cache_work; 448 | struct read_cache_cells *read_cache_cells; 449 | u32 read_cache_threshold; /* Tunable */ 450 | u32 read_cache_threshold_saved; 451 | 452 | /*--------------------------------------------------------------------*/ 453 | 454 | /************ 455 | * Statistics 456 | ************/ 457 | 458 | atomic64_t stat[STATLEN]; 459 | atomic64_t count_non_full_flushed; 460 | 461 | /*--------------------------------------------------------------------*/ 462 | 463 | unsigned long flags; 464 | }; 465 | 466 | /*----------------------------------------------------------------------------*/ 467 | 468 | struct write_io { 469 | void *data; /* 4KB */ 470 | u8 data_bits; 471 | }; 472 | 473 | void acquire_new_seg(struct wb_device *, u64 id); 474 | void cursor_init(struct wb_device *); 475 | void flush_current_buffer(struct wb_device *); 476 | void 
inc_nr_dirty_caches(struct wb_device *); 477 | void dec_nr_dirty_caches(struct wb_device *); 478 | bool mark_clean_mb(struct wb_device *, struct metablock *); 479 | struct dirtiness read_mb_dirtiness(struct wb_device *, struct segment_header *, struct metablock *); 480 | int prepare_overwrite(struct wb_device *, struct segment_header *, struct metablock *old_mb, struct write_io *, u8 overwrite_bits); 481 | 482 | /*----------------------------------------------------------------------------*/ 483 | 484 | #define ASSERT(cond) BUG_ON(!(cond)) 485 | 486 | #define check_buffer_alignment(buf) \ 487 | do_check_buffer_alignment(buf, #buf, __func__) 488 | void do_check_buffer_alignment(void *, const char *, const char *); 489 | 490 | void bio_io_success_compat(struct bio *bio); 491 | 492 | /* 493 | * dm_io wrapper 494 | * thread: run dm_io in other thread to avoid deadlock 495 | */ 496 | #define wb_io(io_req, num_regions, regions, err_bits, thread) \ 497 | wb_io_internal(wb, (io_req), (num_regions), (regions), \ 498 | (err_bits), (thread), __func__) 499 | int wb_io_internal(struct wb_device *, struct dm_io_request *, 500 | unsigned num_regions, struct dm_io_region *, 501 | unsigned long *err_bits, bool thread, const char *caller); 502 | 503 | sector_t dm_devsize(struct dm_dev *); 504 | 505 | /*----------------------------------------------------------------------------*/ 506 | 507 | #if LINUX_VERSION_CODE >= KERNEL_VERSION(6,0,0) 508 | #define req_is_write(req) op_is_write((req)->bi_opf) 509 | #define WB_IO_WRITE .bi_opf = REQ_OP_WRITE 510 | #define WB_IO_READ .bi_opf = REQ_OP_READ 511 | #define WB_IO_WRITE_FUA .bi_opf = REQ_OP_WRITE | REQ_FUA 512 | #define bio_is_barrier(bio) ((bio)->bi_opf & REQ_PREFLUSH) 513 | #define bio_is_fua(bio) ((bio)->bi_opf & REQ_FUA) 514 | #elif LINUX_VERSION_CODE >= KERNEL_VERSION(4,8,0) 515 | #define req_is_write(req) op_is_write((req)->bi_op) 516 | #define WB_IO_WRITE .bi_op = REQ_OP_WRITE, .bi_op_flags = 0 517 | #define WB_IO_READ .bi_op = REQ_OP_READ, .bi_op_flags = 0 518 | #define WB_IO_WRITE_FUA .bi_op = REQ_OP_WRITE, .bi_op_flags = REQ_FUA 519 | #define bio_is_barrier(bio) ((bio)->bi_opf & REQ_PREFLUSH) 520 | #define bio_is_fua(bio) ((bio)->bi_opf & REQ_FUA) 521 | #else 522 | #define req_is_write(req) ((req)->bi_rw == WRITE) 523 | #define bio_is_barrier(bio) ((bio)->bi_rw & REQ_FLUSH) 524 | #define bio_is_fua(bio) ((bio)->bi_rw & REQ_FUA) 525 | #define WB_IO_WRITE .bi_rw = WRITE 526 | #define WB_IO_READ .bi_rw = READ 527 | #define WB_IO_WRITE_FUA .bi_rw = WRITE_FUA 528 | #endif 529 | 530 | /*----------------------------------------------------------------------------*/ 531 | 532 | #if LINUX_VERSION_CODE >= KERNEL_VERSION(4,15,0) 533 | #define read_once(x) READ_ONCE(x) 534 | #else 535 | #define read_once(x) ACCESS_ONCE(x) 536 | #endif 537 | 538 | /*----------------------------------------------------------------------------*/ 539 | 540 | #if LINUX_VERSION_CODE >= KERNEL_VERSION(5,12,0) 541 | #define dm_blkdev_issue_flush(x, y) blkdev_issue_flush(x) 542 | #elif LINUX_VERSION_CODE >= KERNEL_VERSION(5,8,0) 543 | #define dm_blkdev_issue_flush(x, y) blkdev_issue_flush(x, y) 544 | #else 545 | #define dm_blkdev_issue_flush(x, y) blkdev_issue_flush(x, y, NULL) 546 | #endif 547 | 548 | #endif 549 | -------------------------------------------------------------------------------- /src/dm-writeboost-daemon.c: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of dm-writeboost 3 | * Copyright (C) 2012-2025 Akira 
Hayakawa 4 | * 5 | * This program is free software; you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation; either version 2 of the License, or 8 | * (at your option) any later version. 9 | * 10 | * This program is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | * GNU General Public License for more details. 14 | * 15 | * You should have received a copy of the GNU General Public License along 16 | * with this program; if not, write to the Free Software Foundation, Inc., 17 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 18 | */ 19 | 20 | #include "dm-writeboost.h" 21 | #include "dm-writeboost-metadata.h" 22 | #include "dm-writeboost-daemon.h" 23 | 24 | #if LINUX_VERSION_CODE >= KERNEL_VERSION(5,7,0) 25 | #include 26 | #endif 27 | #include 28 | 29 | /*----------------------------------------------------------------------------*/ 30 | 31 | void queue_barrier_io(struct wb_device *wb, struct bio *bio) 32 | { 33 | mutex_lock(&wb->io_lock); 34 | bio_list_add(&wb->barrier_ios, bio); 35 | mutex_unlock(&wb->io_lock); 36 | 37 | /* 38 | * queue_work does nothing if the work is already in the queue. 39 | * So we don't have to care about it. 40 | */ 41 | queue_work(wb->barrier_wq, &wb->flush_barrier_work); 42 | } 43 | 44 | void flush_barrier_ios(struct work_struct *work) 45 | { 46 | struct wb_device *wb = container_of( 47 | work, struct wb_device, flush_barrier_work); 48 | 49 | if (bio_list_empty(&wb->barrier_ios)) 50 | return; 51 | 52 | atomic64_inc(&wb->count_non_full_flushed); 53 | flush_current_buffer(wb); 54 | } 55 | 56 | /*----------------------------------------------------------------------------*/ 57 | 58 | static void process_deferred_barriers(struct wb_device *wb, struct rambuffer *rambuf) 59 | { 60 | bool has_barrier = !bio_list_empty(&rambuf->barrier_ios); 61 | if (has_barrier) { 62 | struct bio *bio; 63 | 64 | /* Make all the preceding data persistent. */ 65 | int err = dm_blkdev_issue_flush(wb->cache_dev->bdev, GFP_NOIO); 66 | 67 | /* Ack the chained barrier requests. */ 68 | while ((bio = bio_list_pop(&rambuf->barrier_ios))) 69 | /* 70 | * We won't endio with the err returned from blkdev_issue_flush 71 | * because it's sort of meaningless to return a detailed error here 72 | * and other parts of the code even in foreground round the error 73 | * off to bio_io_error which returns a generic error which results in 74 | * IOERR in userland. 
75 | */ 76 | if (unlikely(err)) 77 | bio_io_error(bio); 78 | else 79 | bio_io_success_compat(bio); 80 | } 81 | } 82 | 83 | static bool should_flush(struct wb_device *wb) 84 | { 85 | return atomic64_read(&wb->last_queued_segment_id) > 86 | atomic64_read(&wb->last_flushed_segment_id); 87 | } 88 | 89 | static void do_flush_proc(struct wb_device *wb) 90 | { 91 | struct segment_header *seg; 92 | struct rambuffer *rambuf; 93 | u64 id; 94 | struct dm_io_request io_req; 95 | struct dm_io_region region; 96 | 97 | if (!should_flush(wb)) { 98 | schedule_timeout_interruptible(msecs_to_jiffies(1000)); 99 | return; 100 | } 101 | 102 | id = atomic64_read(&wb->last_flushed_segment_id) + 1; 103 | 104 | smp_rmb(); 105 | 106 | rambuf = get_rambuffer_by_id(wb, id); 107 | seg = rambuf->seg; 108 | 109 | io_req = (struct dm_io_request) { 110 | WB_IO_WRITE, 111 | .client = wb->io_client, 112 | .notify.fn = NULL, 113 | .mem.type = DM_IO_VMA, 114 | .mem.ptr.addr = rambuf->data, 115 | }; 116 | region = (struct dm_io_region) { 117 | .bdev = wb->cache_dev->bdev, 118 | .sector = seg->start_sector, 119 | .count = (seg->length + 1) << 3, 120 | }; 121 | 122 | if (wb_io(&io_req, 1, ®ion, NULL, false)) 123 | return; 124 | 125 | /* 126 | * Deferred ACK for barrier requests 127 | * To serialize barrier ACK in logging we wait for the previous segment 128 | * to be persistently written (if needed). 129 | */ 130 | process_deferred_barriers(wb, rambuf); 131 | 132 | /* 133 | * We can count up the last_flushed_segment_id only after segment 134 | * is written persistently. Counting up the id is serialized. 135 | */ 136 | smp_wmb(); 137 | atomic64_inc(&wb->last_flushed_segment_id); 138 | wake_up(&wb->flush_wait_queue); 139 | } 140 | 141 | int flush_daemon_proc(void *data) 142 | { 143 | struct wb_device *wb = data; 144 | while (!kthread_should_stop()) 145 | do_flush_proc(wb); 146 | return 0; 147 | } 148 | 149 | void wait_for_flushing(struct wb_device *wb, u64 id) 150 | { 151 | wait_event(wb->flush_wait_queue, 152 | atomic64_read(&wb->last_flushed_segment_id) >= id); 153 | smp_rmb(); 154 | } 155 | 156 | /*----------------------------------------------------------------------------*/ 157 | 158 | static void writeback_endio(unsigned long error, void *context) 159 | { 160 | struct wb_device *wb = context; 161 | 162 | if (error) 163 | atomic_inc(&wb->writeback_fail_count); 164 | 165 | if (atomic_dec_and_test(&wb->writeback_io_count)) 166 | wake_up(&wb->writeback_io_wait_queue); 167 | } 168 | 169 | static void submit_writeback_io(struct wb_device *wb, struct writeback_io *writeback_io) 170 | { 171 | ASSERT(writeback_io->data_bits > 0); 172 | 173 | if (writeback_io->data_bits == 255) { 174 | struct dm_io_request io_req_w = { 175 | WB_IO_WRITE, 176 | .client = wb->io_client, 177 | .notify.fn = writeback_endio, 178 | .notify.context = wb, 179 | .mem.type = DM_IO_VMA, 180 | .mem.ptr.addr = writeback_io->data, 181 | }; 182 | struct dm_io_region region_w = { 183 | .bdev = wb->backing_dev->bdev, 184 | .sector = writeback_io->sector, 185 | .count = 1 << 3, 186 | }; 187 | if (wb_io(&io_req_w, 1, ®ion_w, NULL, false)) 188 | writeback_endio(1, wb); 189 | } else { 190 | u8 i; 191 | for (i = 0; i < 8; i++) { 192 | struct dm_io_request io_req_w; 193 | struct dm_io_region region_w; 194 | 195 | bool bit_on = writeback_io->data_bits & (1 << i); 196 | if (!bit_on) 197 | continue; 198 | 199 | io_req_w = (struct dm_io_request) { 200 | WB_IO_WRITE, 201 | .client = wb->io_client, 202 | .notify.fn = writeback_endio, 203 | .notify.context = wb, 204 | .mem.type = 
DM_IO_VMA, 205 | .mem.ptr.addr = writeback_io->data + (i << 9), 206 | }; 207 | region_w = (struct dm_io_region) { 208 | .bdev = wb->backing_dev->bdev, 209 | .sector = writeback_io->sector + i, 210 | .count = 1, 211 | }; 212 | if (wb_io(&io_req_w, 1, ®ion_w, NULL, false)) 213 | writeback_endio(1, wb); 214 | } 215 | } 216 | } 217 | 218 | static void submit_writeback_ios(struct wb_device *wb) 219 | { 220 | struct blk_plug plug; 221 | struct rb_root wt = wb->writeback_tree; 222 | blk_start_plug(&plug); 223 | while (!RB_EMPTY_ROOT(&wt)) { 224 | struct writeback_io *writeback_io = writeback_io_from_node(rb_first(&wt)); 225 | rb_erase(&writeback_io->rb_node, &wt); 226 | submit_writeback_io(wb, writeback_io); 227 | } 228 | blk_finish_plug(&plug); 229 | } 230 | 231 | /* 232 | * Compare two writeback IOs 233 | * If the two have the same sector then compare them with the IDs. 234 | * We process the older ID first and then overwrites with the older. 235 | * 236 | * (10, 3) < (11, 1) 237 | * (10, 3) < (10, 4) 238 | */ 239 | static bool compare_writeback_io(struct writeback_io *a, struct writeback_io *b) 240 | { 241 | ASSERT(a); 242 | ASSERT(b); 243 | if (a->sector < b->sector) 244 | return true; 245 | if (a->id < b->id) 246 | return true; 247 | return false; 248 | } 249 | 250 | static void inc_writeback_io_count(u8 data_bits, size_t *writeback_io_count) 251 | { 252 | if (data_bits == 255) { 253 | (*writeback_io_count)++; 254 | } else { 255 | u8 i; 256 | for (i = 0; i < 8; i++) { 257 | if (data_bits & (1 << i)) 258 | (*writeback_io_count)++; 259 | } 260 | } 261 | } 262 | 263 | /* 264 | * Add writeback IO to RB-tree for sorted writeback. 265 | * All writeback IOs are sorted in ascending order. 266 | */ 267 | static void add_writeback_io(struct wb_device *wb, struct writeback_io *writeback_io) 268 | { 269 | struct rb_node **rbp, *parent; 270 | rbp = &wb->writeback_tree.rb_node; 271 | parent = NULL; 272 | while (*rbp) { 273 | struct writeback_io *parent_io; 274 | parent = *rbp; 275 | parent_io = writeback_io_from_node(parent); 276 | 277 | if (compare_writeback_io(writeback_io, parent_io)) 278 | rbp = &(*rbp)->rb_left; 279 | else 280 | rbp = &(*rbp)->rb_right; 281 | } 282 | rb_link_node(&writeback_io->rb_node, parent, rbp); 283 | rb_insert_color(&writeback_io->rb_node, &wb->writeback_tree); 284 | } 285 | 286 | static int fill_writeback_seg(struct wb_device *wb, struct writeback_segment *writeback_seg) 287 | { 288 | struct segment_header *seg = writeback_seg->seg; 289 | 290 | struct dm_io_request io_req_r = { 291 | WB_IO_READ, 292 | .client = wb->io_client, 293 | .notify.fn = NULL, 294 | .mem.type = DM_IO_VMA, 295 | .mem.ptr.addr = writeback_seg->buf, 296 | }; 297 | struct dm_io_region region_r = { 298 | .bdev = wb->cache_dev->bdev, 299 | .sector = seg->start_sector + (1 << 3), /* Header excluded */ 300 | .count = seg->length << 3, 301 | }; 302 | 303 | /* 304 | * dm_io() allows region.count = 0 305 | * so we don't need to skip here in case of seg->length = 0 306 | */ 307 | return wb_io(&io_req_r, 1, ®ion_r, NULL, false); 308 | } 309 | 310 | static void prepare_writeback_ios(struct wb_device *wb, struct writeback_segment *writeback_seg, 311 | size_t *writeback_io_count) 312 | { 313 | struct segment_header *seg = writeback_seg->seg; 314 | 315 | u8 i; 316 | for (i = 0; i < seg->length; i++) { 317 | struct writeback_io *writeback_io; 318 | 319 | struct metablock *mb = seg->mb_array + i; 320 | struct dirtiness dirtiness = read_mb_dirtiness(wb, seg, mb); 321 | ASSERT(dirtiness.data_bits > 0); 322 | if 
(!dirtiness.is_dirty) 323 | continue; 324 | 325 | writeback_io = writeback_seg->ios + i; 326 | writeback_io->sector = mb->sector; 327 | writeback_io->id = seg->id; 328 | /* writeback_io->data is already set */ 329 | writeback_io->data_bits = dirtiness.data_bits; 330 | 331 | inc_writeback_io_count(writeback_io->data_bits, writeback_io_count); 332 | add_writeback_io(wb, writeback_io); 333 | } 334 | } 335 | 336 | void mark_clean_seg(struct wb_device *wb, struct segment_header *seg) 337 | { 338 | u8 i; 339 | for (i = 0; i < seg->length; i++) { 340 | struct metablock *mb = seg->mb_array + i; 341 | if (mark_clean_mb(wb, mb)) 342 | dec_nr_dirty_caches(wb); 343 | } 344 | } 345 | 346 | /* 347 | * Try writeback some specified segs and returns if all writeback ios succeeded. 348 | */ 349 | static bool try_writeback_segs(struct wb_device *wb) 350 | { 351 | struct writeback_segment *writeback_seg; 352 | size_t writeback_io_count = 0; 353 | u32 k; 354 | 355 | /* Create RB-tree */ 356 | wb->writeback_tree = RB_ROOT; 357 | for (k = 0; k < wb->nr_cur_batched_writeback; k++) { 358 | writeback_seg = *(wb->writeback_segs + k); 359 | 360 | if (fill_writeback_seg(wb, writeback_seg)) 361 | return false; 362 | 363 | prepare_writeback_ios(wb, writeback_seg, &writeback_io_count); 364 | } 365 | 366 | atomic_set(&wb->writeback_io_count, writeback_io_count); 367 | atomic_set(&wb->writeback_fail_count, 0); 368 | 369 | /* Pop rbnodes out of the tree and submit writeback I/Os */ 370 | submit_writeback_ios(wb); 371 | wait_event(wb->writeback_io_wait_queue, !atomic_read(&wb->writeback_io_count)); 372 | 373 | return atomic_read(&wb->writeback_fail_count) == 0; 374 | } 375 | 376 | static bool do_writeback_segs(struct wb_device *wb) 377 | { 378 | if (!try_writeback_segs(wb)) 379 | return false; 380 | 381 | dm_blkdev_issue_flush(wb->backing_dev->bdev, GFP_NOIO); 382 | return true; 383 | } 384 | 385 | /* 386 | * Calculate the number of segments to write back. 
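 * nr_empty_segs (updated by update_nr_empty_segs() below) is how many more
 * segments the current position can advance before it reaches segments that
 * have not been written back yet. calc_nr_writeback() then takes the minimum
 * of: the number of segments flushed but not yet written back, the allocated
 * batch size (nr_writeback_segs), and nr_empty_segs + 1.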
387 | */ 388 | void update_nr_empty_segs(struct wb_device *wb) 389 | { 390 | wb->nr_empty_segs = 391 | atomic64_read(&wb->last_writeback_segment_id) + wb->nr_segments 392 | - wb->current_seg->id; 393 | } 394 | 395 | static u32 calc_nr_writeback(struct wb_device *wb) 396 | { 397 | u32 nr_writeback_candidates = 398 | atomic64_read(&wb->last_flushed_segment_id) 399 | - atomic64_read(&wb->last_writeback_segment_id); 400 | 401 | u32 nr_max_batch = read_once(wb->nr_max_batched_writeback); 402 | if (wb->nr_writeback_segs != nr_max_batch) 403 | try_alloc_writeback_ios(wb, nr_max_batch, GFP_NOIO | __GFP_NOWARN); 404 | 405 | return min3(nr_writeback_candidates, wb->nr_writeback_segs, wb->nr_empty_segs + 1); 406 | } 407 | 408 | static bool should_writeback(struct wb_device *wb) 409 | { 410 | return read_once(wb->allow_writeback) || 411 | read_once(wb->urge_writeback) || 412 | read_once(wb->force_drop); 413 | } 414 | 415 | static void do_writeback_proc(struct wb_device *wb) 416 | { 417 | u32 k, nr_writeback_tbd; 418 | 419 | if (!should_writeback(wb)) { 420 | schedule_timeout_interruptible(msecs_to_jiffies(1000)); 421 | return; 422 | } 423 | 424 | nr_writeback_tbd = calc_nr_writeback(wb); 425 | if (!nr_writeback_tbd) { 426 | schedule_timeout_interruptible(msecs_to_jiffies(1000)); 427 | return; 428 | } 429 | 430 | smp_rmb(); 431 | 432 | /* Store segments into writeback_segs */ 433 | for (k = 0; k < nr_writeback_tbd; k++) { 434 | struct writeback_segment *writeback_seg = *(wb->writeback_segs + k); 435 | writeback_seg->seg = get_segment_header_by_id(wb, 436 | atomic64_read(&wb->last_writeback_segment_id) + 1 + k); 437 | } 438 | wb->nr_cur_batched_writeback = nr_writeback_tbd; 439 | 440 | if (!do_writeback_segs(wb)) 441 | return; 442 | 443 | /* A segment after written back is clean */ 444 | for (k = 0; k < wb->nr_cur_batched_writeback; k++) { 445 | struct writeback_segment *writeback_seg = *(wb->writeback_segs + k); 446 | mark_clean_seg(wb, writeback_seg->seg); 447 | } 448 | 449 | smp_wmb(); 450 | atomic64_add(wb->nr_cur_batched_writeback, &wb->last_writeback_segment_id); 451 | wake_up(&wb->writeback_wait_queue); 452 | } 453 | 454 | int writeback_daemon_proc(void *data) 455 | { 456 | struct wb_device *wb = data; 457 | while (!kthread_should_stop()) 458 | do_writeback_proc(wb); 459 | return 0; 460 | } 461 | 462 | /* 463 | * Wait for a segment to be written back. 464 | * The segment after written back is clean. 
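 * If the requested segment hasn't been written back yet, this sets
 * urge_writeback, wakes the writeback daemon, and sleeps on
 * writeback_wait_queue until last_writeback_segment_id reaches the id.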
465 | */ 466 | void wait_for_writeback(struct wb_device *wb, u64 id) 467 | { 468 | if (atomic64_read(&wb->last_writeback_segment_id) < id) { 469 | wb->urge_writeback = true; 470 | wake_up_process(wb->writeback_daemon); 471 | wait_event(wb->writeback_wait_queue, 472 | atomic64_read(&wb->last_writeback_segment_id) >= id); 473 | wb->urge_writeback = false; 474 | } 475 | smp_rmb(); 476 | } 477 | 478 | /*----------------------------------------------------------------------------*/ 479 | 480 | int writeback_modulator_proc(void *data) 481 | { 482 | struct wb_device *wb = data; 483 | 484 | #if LINUX_VERSION_CODE >= KERNEL_VERSION(5,11,0) 485 | struct block_device *part = wb->backing_dev->bdev; 486 | #else 487 | struct hd_struct *part = wb->backing_dev->bdev->bd_part; 488 | #endif 489 | unsigned long old = 0, new, util; 490 | unsigned long intvl = 1000; 491 | 492 | while (!kthread_should_stop()) { 493 | new = jiffies_to_msecs(part_stat_read(part, io_ticks)); 494 | 495 | util = div_u64(100 * (new - old), 1000); 496 | 497 | if (util < read_once(wb->writeback_threshold)) 498 | wb->allow_writeback = true; 499 | else 500 | wb->allow_writeback = false; 501 | 502 | old = new; 503 | 504 | update_nr_empty_segs(wb); 505 | 506 | schedule_timeout_interruptible(msecs_to_jiffies(intvl)); 507 | } 508 | return 0; 509 | } 510 | 511 | /*----------------------------------------------------------------------------*/ 512 | 513 | static void update_superblock_record(struct wb_device *wb) 514 | { 515 | struct superblock_record_device o; 516 | void *buf; 517 | struct dm_io_request io_req; 518 | struct dm_io_region region; 519 | 520 | o.last_writeback_segment_id = 521 | cpu_to_le64(atomic64_read(&wb->last_writeback_segment_id)); 522 | 523 | buf = mempool_alloc(wb->buf_8_pool, GFP_NOIO); 524 | if (!buf) 525 | return; 526 | 527 | memset(buf, 0, 8 << 9); 528 | memcpy(buf + (7 << 9), &o, sizeof(o)); 529 | 530 | io_req = (struct dm_io_request) { 531 | WB_IO_WRITE_FUA, 532 | .client = wb->io_client, 533 | .notify.fn = NULL, 534 | .mem.type = DM_IO_KMEM, 535 | .mem.ptr.addr = buf, 536 | }; 537 | region = (struct dm_io_region) { 538 | .bdev = wb->cache_dev->bdev, 539 | .sector = (1 << 11) - 8, 540 | .count = 8, 541 | }; 542 | wb_io(&io_req, 1, &region, NULL, false); 543 | 544 | mempool_free(buf, wb->buf_8_pool); 545 | } 546 | 547 | int sb_record_updater_proc(void *data) 548 | { 549 | struct wb_device *wb = data; 550 | 551 | unsigned long intvl; 552 | 553 | while (!kthread_should_stop()) { 554 | /* sec -> ms */ 555 | intvl = read_once(wb->update_sb_record_interval) * 1000; 556 | 557 | if (!intvl) { 558 | schedule_timeout_interruptible(msecs_to_jiffies(1000)); 559 | continue; 560 | } 561 | 562 | update_superblock_record(wb); 563 | schedule_timeout_interruptible(msecs_to_jiffies(intvl)); 564 | } 565 | return 0; 566 | } 567 | 568 | /*----------------------------------------------------------------------------*/ 569 | 570 | int data_synchronizer_proc(void *data) 571 | { 572 | struct wb_device *wb = data; 573 | unsigned long intvl; 574 | 575 | while (!kthread_should_stop()) { 576 | /* sec -> ms */ 577 | intvl = read_once(wb->sync_data_interval) * 1000; 578 | 579 | if (!intvl) { 580 | schedule_timeout_interruptible(msecs_to_jiffies(1000)); 581 | continue; 582 | } 583 | 584 | flush_current_buffer(wb); 585 | dm_blkdev_issue_flush(wb->cache_dev->bdev, GFP_NOIO); 586 | schedule_timeout_interruptible(msecs_to_jiffies(intvl)); 587 | } 588 | return 0; 589 | } 590 | --------------------------------------------------------------------------------
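The modulator loop above reduces to a simple rule: once a second, sample the backing device's io_ticks, convert the delta into a utilization percentage over the 1000 ms window, and permit background writeback only while that utilization stays below the user-set writeback_threshold. The following standalone sketch (not part of the module; the helper names and sample numbers are illustrative only) restates that rule in plain C:

/*
 * Sketch of the throttling rule used by writeback_modulator_proc().
 * The helpers and the sample values below are hypothetical, for
 * illustration only; they are not part of the dm-writeboost sources.
 */
#include <stdbool.h>
#include <stdio.h>

/* io_ticks delta over a 1000 ms polling window, as a percentage */
static unsigned long util_percent(unsigned long old_io_ticks_ms,
                                  unsigned long new_io_ticks_ms)
{
        return (100 * (new_io_ticks_ms - old_io_ticks_ms)) / 1000;
}

/* writeback is allowed only while the backing device is mostly idle */
static bool allow_writeback(unsigned long util, unsigned long threshold)
{
        return util < threshold;
}

int main(void)
{
        /* e.g. the backing device was busy for 350 ms of the last second */
        unsigned long util = util_percent(10000, 10350);
        printf("util=%lu%% allow_writeback=%d\n", util, allow_writeback(util, 70));
        return 0;
}

Note that with the initial writeback_threshold of 0 (set in init_writeback_modulator) the predicate util < threshold can never hold, so the modulator alone never enables writeback; in that configuration writeback proceeds only via urge_writeback or force_drop.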
/LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. 
The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. 
But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 
176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. 
If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 
287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | {description} 294 | Copyright (C) {year} {fullname} 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 331 | 332 | {signature of Ty Coon}, 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 340 | 341 | -------------------------------------------------------------------------------- /src/dm-writeboost-metadata.c: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of dm-writeboost 3 | * Copyright (C) 2012-2025 Akira Hayakawa 4 | * 5 | * This program is free software; you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation; either version 2 of the License, or 8 | * (at your option) any later version. 9 | * 10 | * This program is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the 13 | * GNU General Public License for more details. 14 | * 15 | * You should have received a copy of the GNU General Public License along 16 | * with this program; if not, write to the Free Software Foundation, Inc., 17 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 18 | */ 19 | 20 | #include "dm-writeboost.h" 21 | #include "dm-writeboost-metadata.h" 22 | #include "dm-writeboost-daemon.h" 23 | 24 | /*----------------------------------------------------------------------------*/ 25 | 26 | struct large_array { 27 | u64 nr_elems; 28 | u32 elemsize; 29 | void *data; 30 | }; 31 | 32 | static struct large_array *large_array_alloc(u32 elemsize, u64 nr_elems) 33 | { 34 | struct large_array *arr = kmalloc(sizeof(*arr), GFP_KERNEL); 35 | if (!arr) { 36 | DMERR("Failed to allocate arr"); 37 | return NULL; 38 | } 39 | 40 | arr->elemsize = elemsize; 41 | arr->nr_elems = nr_elems; 42 | 43 | arr->data = vmalloc(elemsize * nr_elems); 44 | if (!arr->data) { 45 | DMERR("Failed to allocate data"); 46 | goto bad_alloc_data; 47 | } 48 | 49 | return arr; 50 | 51 | bad_alloc_data: 52 | kfree(arr); 53 | return NULL; 54 | } 55 | 56 | static void large_array_free(struct large_array *arr) 57 | { 58 | vfree(arr->data); 59 | kfree(arr); 60 | } 61 | 62 | static void *large_array_at(struct large_array *arr, u64 i) 63 | { 64 | return arr->data + arr->elemsize * i; 65 | } 66 | 67 | /*----------------------------------------------------------------------------*/ 68 | 69 | /* 70 | * Get the in-core metablock of the given index. 71 | */ 72 | static struct metablock *mb_at(struct wb_device *wb, u32 idx) 73 | { 74 | u32 idx_inseg; 75 | u32 seg_idx = div_u64_rem(idx, wb->nr_caches_inseg, &idx_inseg); 76 | struct segment_header *seg = large_array_at(wb->segment_header_array, seg_idx); 77 | return seg->mb_array + idx_inseg; 78 | } 79 | 80 | static void mb_array_empty_init(struct wb_device *wb) 81 | { 82 | u32 i; 83 | for (i = 0; i < wb->nr_caches; i++) { 84 | struct metablock *mb = mb_at(wb, i); 85 | INIT_HLIST_NODE(&mb->ht_list); 86 | 87 | mb->idx = i; 88 | mb->dirtiness.data_bits = 0; 89 | mb->dirtiness.is_dirty = false; 90 | } 91 | } 92 | 93 | /* 94 | * Calc the starting sector of the k-th segment 95 | */ 96 | static sector_t calc_segment_header_start(struct wb_device *wb, u32 k) 97 | { 98 | return (1 << 11) + (1 << SEGMENT_SIZE_ORDER) * k; 99 | } 100 | 101 | static u32 calc_nr_segments(struct dm_dev *dev, struct wb_device *wb) 102 | { 103 | sector_t devsize = dm_devsize(dev); 104 | return div_u64(devsize - (1 << 11), 1 << SEGMENT_SIZE_ORDER); 105 | } 106 | 107 | /* 108 | * Get the relative index in a segment of the mb_idx-th metablock 109 | */ 110 | u8 mb_idx_inseg(struct wb_device *wb, u32 mb_idx) 111 | { 112 | u32 tmp32; 113 | div_u64_rem(mb_idx, wb->nr_caches_inseg, &tmp32); 114 | return tmp32; 115 | } 116 | 117 | /* 118 | * Calc the starting sector of the mb_idx-th cache block 119 | */ 120 | sector_t calc_mb_start_sector(struct wb_device *wb, struct segment_header *seg, u32 mb_idx) 121 | { 122 | return seg->start_sector + ((1 + mb_idx_inseg(wb, mb_idx)) << 3); 123 | } 124 | 125 | /* 126 | * Get the segment that contains the passed mb 127 | */ 128 | struct segment_header *mb_to_seg(struct wb_device *wb, struct metablock *mb) 129 | { 130 | struct segment_header *seg; 131 | seg = ((void *) mb) 132 | - mb_idx_inseg(wb, mb->idx) * sizeof(struct metablock) 133 | - sizeof(struct segment_header); 134 | return seg; 135 | } 136 | 137 | bool is_on_buffer(struct wb_device *wb, u32 mb_idx) 138 | { 139 | u32 
start = wb->current_seg->start_idx; 140 | if (mb_idx < start) 141 | return false; 142 | 143 | if (mb_idx >= (start + wb->nr_caches_inseg)) 144 | return false; 145 | 146 | return true; 147 | } 148 | 149 | static u32 segment_id_to_idx(struct wb_device *wb, u64 id) 150 | { 151 | u32 idx; 152 | div_u64_rem(id - 1, wb->nr_segments, &idx); 153 | return idx; 154 | } 155 | 156 | static struct segment_header *segment_at(struct wb_device *wb, u32 k) 157 | { 158 | return large_array_at(wb->segment_header_array, k); 159 | } 160 | 161 | /* 162 | * Get the segment from the segment id. 163 | * The index of the segment is calculated from the segment id. 164 | */ 165 | struct segment_header *get_segment_header_by_id(struct wb_device *wb, u64 id) 166 | { 167 | return segment_at(wb, segment_id_to_idx(wb, id)); 168 | } 169 | 170 | /*----------------------------------------------------------------------------*/ 171 | 172 | static int init_segment_header_array(struct wb_device *wb) 173 | { 174 | u32 segment_idx; 175 | 176 | wb->segment_header_array = large_array_alloc( 177 | sizeof(struct segment_header) + 178 | sizeof(struct metablock) * wb->nr_caches_inseg, 179 | wb->nr_segments); 180 | if (!wb->segment_header_array) { 181 | DMERR("Failed to allocate segment_header_array"); 182 | return -ENOMEM; 183 | } 184 | 185 | for (segment_idx = 0; segment_idx < wb->nr_segments; segment_idx++) { 186 | struct segment_header *seg = large_array_at(wb->segment_header_array, segment_idx); 187 | 188 | seg->id = 0; 189 | seg->length = 0; 190 | atomic_set(&seg->nr_inflight_ios, 0); 191 | 192 | /* Const values */ 193 | seg->start_idx = wb->nr_caches_inseg * segment_idx; 194 | seg->start_sector = calc_segment_header_start(wb, segment_idx); 195 | } 196 | 197 | mb_array_empty_init(wb); 198 | 199 | return 0; 200 | } 201 | 202 | static void free_segment_header_array(struct wb_device *wb) 203 | { 204 | large_array_free(wb->segment_header_array); 205 | } 206 | 207 | /*----------------------------------------------------------------------------*/ 208 | 209 | struct ht_head { 210 | struct hlist_head ht_list; 211 | }; 212 | 213 | static int ht_empty_init(struct wb_device *wb) 214 | { 215 | u32 idx; 216 | size_t i, nr_heads; 217 | struct large_array *arr; 218 | 219 | wb->htsize = wb->nr_caches; 220 | nr_heads = wb->htsize + 1; 221 | arr = large_array_alloc(sizeof(struct ht_head), nr_heads); 222 | if (!arr) { 223 | DMERR("Failed to allocate htable"); 224 | return -ENOMEM; 225 | } 226 | 227 | wb->htable = arr; 228 | 229 | for (i = 0; i < nr_heads; i++) { 230 | struct ht_head *hd = large_array_at(arr, i); 231 | INIT_HLIST_HEAD(&hd->ht_list); 232 | } 233 | 234 | wb->null_head = large_array_at(wb->htable, wb->htsize); 235 | 236 | for (idx = 0; idx < wb->nr_caches; idx++) { 237 | struct metablock *mb = mb_at(wb, idx); 238 | hlist_add_head(&mb->ht_list, &wb->null_head->ht_list); 239 | } 240 | 241 | return 0; 242 | } 243 | 244 | static void free_ht(struct wb_device *wb) 245 | { 246 | large_array_free(wb->htable); 247 | } 248 | 249 | struct ht_head *ht_get_head(struct wb_device *wb, struct lookup_key *key) 250 | { 251 | u32 idx; 252 | div_u64_rem(key->sector >> 3, wb->htsize, &idx); 253 | return large_array_at(wb->htable, idx); 254 | } 255 | 256 | static bool mb_hit(struct metablock *mb, struct lookup_key *key) 257 | { 258 | return mb->sector == key->sector; 259 | } 260 | 261 | /* 262 | * Remove the metablock from the hashtable and link the orphan to the null head. 
263 | */ 264 | void ht_del(struct wb_device *wb, struct metablock *mb) 265 | { 266 | struct ht_head *null_head; 267 | 268 | hlist_del(&mb->ht_list); 269 | 270 | null_head = wb->null_head; 271 | hlist_add_head(&mb->ht_list, &null_head->ht_list); 272 | } 273 | 274 | void ht_register(struct wb_device *wb, struct ht_head *head, 275 | struct metablock *mb, struct lookup_key *key) 276 | { 277 | hlist_del(&mb->ht_list); 278 | hlist_add_head(&mb->ht_list, &head->ht_list); 279 | 280 | BUG_ON(key->sector & 7); // should be 4KB aligned 281 | mb->sector = key->sector; 282 | }; 283 | 284 | struct metablock *ht_lookup(struct wb_device *wb, struct ht_head *head, 285 | struct lookup_key *key) 286 | { 287 | struct metablock *mb, *found = NULL; 288 | hlist_for_each_entry(mb, &head->ht_list, ht_list) { 289 | if (mb_hit(mb, key)) { 290 | found = mb; 291 | break; 292 | } 293 | } 294 | return found; 295 | } 296 | 297 | /* 298 | * Remove all the metablock in the segment from the lookup table. 299 | */ 300 | void discard_caches_inseg(struct wb_device *wb, struct segment_header *seg) 301 | { 302 | u8 i; 303 | for (i = 0; i < wb->nr_caches_inseg; i++) { 304 | struct metablock *mb = seg->mb_array + i; 305 | ht_del(wb, mb); 306 | } 307 | } 308 | 309 | /*----------------------------------------------------------------------------*/ 310 | 311 | static int read_superblock_header(struct superblock_header_device *sup, 312 | struct wb_device *wb) 313 | { 314 | int err = 0; 315 | struct dm_io_request io_req_sup; 316 | struct dm_io_region region_sup; 317 | 318 | void *buf = mempool_alloc(wb->buf_8_pool, GFP_KERNEL); 319 | if (!buf) 320 | return -ENOMEM; 321 | check_buffer_alignment(buf); 322 | 323 | io_req_sup = (struct dm_io_request) { 324 | WB_IO_READ, 325 | .client = wb->io_client, 326 | .notify.fn = NULL, 327 | .mem.type = DM_IO_KMEM, 328 | .mem.ptr.addr = buf, 329 | }; 330 | region_sup = (struct dm_io_region) { 331 | .bdev = wb->cache_dev->bdev, 332 | .sector = 0, 333 | .count = 8, 334 | }; 335 | err = wb_io(&io_req_sup, 1, ®ion_sup, NULL, false); 336 | if (err) 337 | goto bad_io; 338 | 339 | memcpy(sup, buf, sizeof(*sup)); 340 | 341 | bad_io: 342 | mempool_free(buf, wb->buf_8_pool); 343 | return err; 344 | } 345 | 346 | /* 347 | * check if the cache device is already formatted. 348 | * returns 0 iff this routine runs without failure. 
349 | */ 350 | static int audit_cache_device(struct wb_device *wb) 351 | { 352 | int err = 0; 353 | struct superblock_header_device sup; 354 | err = read_superblock_header(&sup, wb); 355 | if (err) { 356 | DMERR("read_superblock_header failed"); 357 | return err; 358 | } 359 | 360 | wb->do_format = false; 361 | if (le32_to_cpu(sup.magic) != WB_MAGIC || 362 | wb->write_around_mode) { /* write-around mode should discard all caches */ 363 | wb->do_format = true; 364 | DMERR("Superblock Header: Magic number invalid"); 365 | return 0; 366 | } 367 | 368 | return err; 369 | } 370 | 371 | static int format_superblock_header(struct wb_device *wb) 372 | { 373 | int err = 0; 374 | 375 | struct dm_io_request io_req_sup; 376 | struct dm_io_region region_sup; 377 | 378 | struct superblock_header_device sup = { 379 | .magic = cpu_to_le32(WB_MAGIC), 380 | }; 381 | 382 | void *buf = mempool_alloc(wb->buf_8_pool, GFP_KERNEL); 383 | if (!buf) 384 | return -ENOMEM; 385 | 386 | memset(buf, 0, 8 << 9); 387 | memcpy(buf, &sup, sizeof(sup)); 388 | 389 | io_req_sup = (struct dm_io_request) { 390 | WB_IO_WRITE_FUA, 391 | .client = wb->io_client, 392 | .notify.fn = NULL, 393 | .mem.type = DM_IO_KMEM, 394 | .mem.ptr.addr = buf, 395 | }; 396 | region_sup = (struct dm_io_region) { 397 | .bdev = wb->cache_dev->bdev, 398 | .sector = 0, 399 | .count = 8, 400 | }; 401 | err = wb_io(&io_req_sup, 1, ®ion_sup, NULL, false); 402 | if (err) 403 | goto bad_io; 404 | 405 | bad_io: 406 | mempool_free(buf, wb->buf_8_pool); 407 | return err; 408 | } 409 | 410 | struct format_segmd_context { 411 | int err; 412 | atomic64_t count; 413 | }; 414 | 415 | static void format_segmd_endio(unsigned long error, void *__context) 416 | { 417 | struct format_segmd_context *context = __context; 418 | if (error) 419 | context->err = 1; 420 | atomic64_dec(&context->count); 421 | } 422 | 423 | struct zeroing_context { 424 | int error; 425 | struct completion complete; 426 | }; 427 | 428 | static void zeroing_complete(int read_err, unsigned long write_err, void *context) 429 | { 430 | struct zeroing_context *zc = context; 431 | if (read_err || write_err) 432 | zc->error = -EIO; 433 | complete(&zc->complete); 434 | } 435 | 436 | /* 437 | * Synchronously zeroes out a region on a device. 438 | */ 439 | static int do_zeroing_region(struct wb_device *wb, struct dm_io_region *region) 440 | { 441 | struct zeroing_context zc; 442 | zc.error = 0; 443 | init_completion(&zc.complete); 444 | dm_kcopyd_zero(wb->copier, 1, region, 0, zeroing_complete, &zc); 445 | wait_for_completion(&zc.complete); 446 | return zc.error; 447 | } 448 | 449 | static int zeroing_full_superblock(struct wb_device *wb) 450 | { 451 | struct dm_io_region region = { 452 | .bdev = wb->cache_dev->bdev, 453 | .sector = 0, 454 | .count = 1 << 11, 455 | }; 456 | return do_zeroing_region(wb, ®ion); 457 | } 458 | 459 | static int format_all_segment_headers(struct wb_device *wb) 460 | { 461 | int err = 0; 462 | struct dm_dev *dev = wb->cache_dev; 463 | u32 i; 464 | 465 | struct format_segmd_context context; 466 | 467 | void *buf = mempool_alloc(wb->buf_8_pool, GFP_KERNEL); 468 | if (!buf) 469 | return -ENOMEM; 470 | 471 | memset(buf, 0, 8 << 9); 472 | check_buffer_alignment(buf); 473 | 474 | atomic64_set(&context.count, wb->nr_segments); 475 | context.err = 0; 476 | 477 | /* Submit all the writes asynchronously. 
*/ 478 | for (i = 0; i < wb->nr_segments; i++) { 479 | struct dm_io_request io_req_seg = { 480 | WB_IO_WRITE, 481 | .client = wb->io_client, 482 | .notify.fn = format_segmd_endio, 483 | .notify.context = &context, 484 | .mem.type = DM_IO_KMEM, 485 | .mem.ptr.addr = buf, 486 | }; 487 | struct dm_io_region region_seg = { 488 | .bdev = dev->bdev, 489 | .sector = calc_segment_header_start(wb, i), 490 | .count = (1 << 3), 491 | }; 492 | err = wb_io(&io_req_seg, 1, &region_seg, NULL, false); 493 | if (err) 494 | break; 495 | } 496 | 497 | if (err) 498 | goto bad; 499 | 500 | /* Wait for all the writes to complete. */ 501 | while (atomic64_read(&context.count)) 502 | schedule_timeout_interruptible(msecs_to_jiffies(100)); 503 | 504 | if (context.err) { 505 | DMERR("I/O failed"); 506 | err = -EIO; 507 | goto bad; 508 | } 509 | 510 | err = dm_blkdev_issue_flush(dev->bdev, GFP_KERNEL); 511 | 512 | bad: 513 | mempool_free(buf, wb->buf_8_pool); 514 | return err; 515 | } 516 | 517 | /* 518 | * Format superblock header and all the segment headers in a cache device 519 | */ 520 | static int format_cache_device(struct wb_device *wb) 521 | { 522 | int err = zeroing_full_superblock(wb); 523 | if (err) { 524 | DMERR("zeroing_full_superblock failed"); 525 | return err; 526 | } 527 | err = format_all_segment_headers(wb); 528 | if (err) { 529 | DMERR("format_all_segment_headers failed"); 530 | return err; 531 | } 532 | err = format_superblock_header(wb); /* First 512B */ 533 | if (err) { 534 | DMERR("format_superblock_header failed"); 535 | return err; 536 | } 537 | return err; 538 | } 539 | 540 | /* 541 | * First check if the superblock and the passed arguments are consistent and 542 | * re-format the cache structure if they are not. 543 | * If you want to re-format the cache device, you must zero out the first 544 | * sector of the device. 
545 | */ 546 | static int might_format_cache_device(struct wb_device *wb) 547 | { 548 | int err = 0; 549 | 550 | err = audit_cache_device(wb); 551 | if (err) { 552 | DMERR("audit_cache_device failed"); 553 | return err; 554 | } 555 | 556 | if (wb->do_format) { 557 | err = format_cache_device(wb); 558 | if (err) { 559 | DMERR("format_cache_device failed"); 560 | return err; 561 | } 562 | } 563 | 564 | return err; 565 | } 566 | 567 | /*----------------------------------------------------------------------------*/ 568 | 569 | static int init_rambuf_pool(struct wb_device *wb) 570 | { 571 | int err = 0; 572 | size_t i; 573 | 574 | wb->rambuf_pool = kmalloc(sizeof(struct rambuffer) * NR_RAMBUF_POOL, GFP_KERNEL); 575 | if (!wb->rambuf_pool) 576 | return -ENOMEM; 577 | 578 | for (i = 0; i < NR_RAMBUF_POOL; i++) { 579 | void *alloced = vmalloc(1 << (SEGMENT_SIZE_ORDER + 9)); 580 | if (!alloced) { 581 | size_t j; 582 | DMERR("Failed to allocate rambuf->data"); 583 | for (j = 0; j < i; j++) { 584 | vfree(wb->rambuf_pool[j].data); 585 | } 586 | err = -ENOMEM; 587 | goto bad_alloc_data; 588 | } 589 | wb->rambuf_pool[i].data = alloced; 590 | } 591 | 592 | return err; 593 | 594 | bad_alloc_data: 595 | kfree(wb->rambuf_pool); 596 | return err; 597 | } 598 | 599 | static void free_rambuf_pool(struct wb_device *wb) 600 | { 601 | size_t i; 602 | for (i = 0; i < NR_RAMBUF_POOL; i++) 603 | vfree(wb->rambuf_pool[i].data); 604 | kfree(wb->rambuf_pool); 605 | } 606 | 607 | struct rambuffer *get_rambuffer_by_id(struct wb_device *wb, u64 id) 608 | { 609 | u32 tmp32; 610 | div_u64_rem(id - 1, NR_RAMBUF_POOL, &tmp32); 611 | return wb->rambuf_pool + tmp32; 612 | } 613 | 614 | /*----------------------------------------------------------------------------*/ 615 | 616 | /* 617 | * Initialize core devices 618 | * - Cache device (SSD) 619 | * - RAM buffers (DRAM) 620 | */ 621 | static int init_devices(struct wb_device *wb) 622 | { 623 | int err = 0; 624 | 625 | err = might_format_cache_device(wb); 626 | if (err) 627 | return err; 628 | 629 | err = init_rambuf_pool(wb); 630 | if (err) { 631 | DMERR("init_rambuf_pool failed"); 632 | return err; 633 | } 634 | 635 | return err; 636 | } 637 | 638 | static void free_devices(struct wb_device *wb) 639 | { 640 | free_rambuf_pool(wb); 641 | } 642 | 643 | /*----------------------------------------------------------------------------*/ 644 | 645 | static int read_superblock_record(struct superblock_record_device *record, 646 | struct wb_device *wb) 647 | { 648 | int err = 0; 649 | struct dm_io_request io_req; 650 | struct dm_io_region region; 651 | 652 | void *buf = mempool_alloc(wb->buf_8_pool, GFP_KERNEL); 653 | if (!buf) 654 | return -ENOMEM; 655 | 656 | check_buffer_alignment(buf); 657 | 658 | io_req = (struct dm_io_request) { 659 | WB_IO_READ, 660 | .client = wb->io_client, 661 | .notify.fn = NULL, 662 | .mem.type = DM_IO_KMEM, 663 | .mem.ptr.addr = buf, 664 | }; 665 | region = (struct dm_io_region) { 666 | .bdev = wb->cache_dev->bdev, 667 | .sector = (1 << 11) - 8, 668 | .count = 8, 669 | }; 670 | err = wb_io(&io_req, 1, ®ion, NULL, false); 671 | if (err) 672 | goto bad_io; 673 | 674 | memcpy(record, buf + (7 << 9), sizeof(*record)); 675 | 676 | bad_io: 677 | mempool_free(buf, wb->buf_8_pool); 678 | return err; 679 | } 680 | 681 | /* 682 | * Read out whole segment of @seg to a pre-allocated @buf 683 | */ 684 | static int read_whole_segment(void *buf, struct wb_device *wb, 685 | struct segment_header *seg) 686 | { 687 | struct dm_io_request io_req = { 688 | WB_IO_READ, 689 | 
.client = wb->io_client, 690 | .notify.fn = NULL, 691 | .mem.type = DM_IO_VMA, 692 | .mem.ptr.addr = buf, 693 | }; 694 | struct dm_io_region region = { 695 | .bdev = wb->cache_dev->bdev, 696 | .sector = seg->start_sector, 697 | .count = 1 << SEGMENT_SIZE_ORDER, 698 | }; 699 | return wb_io(&io_req, 1, &region, NULL, false); 700 | } 701 | 702 | /* 703 | * We compute the checksum of a segment from the valid data in the segment, 704 | * excluding the first sector. 705 | */ 706 | u32 calc_checksum(void *rambuffer, u8 length) 707 | { 708 | unsigned int len = (4096 - 512) + 4096 * length; 709 | return ~crc32c(0xffffffff, rambuffer + 512, len); 710 | } 711 | 712 | void prepare_segment_header_device(void *rambuffer, 713 | struct wb_device *wb, 714 | struct segment_header *src) 715 | { 716 | struct segment_header_device *dest = rambuffer; 717 | u32 i; 718 | 719 | ASSERT((src->length) == (wb->cursor - src->start_idx)); 720 | 721 | for (i = 0; i < src->length; i++) { 722 | struct metablock *mb = src->mb_array + i; 723 | struct metablock_device *mbdev = dest->mbarr + i; 724 | 725 | mbdev->sector = cpu_to_le64((u64)mb->sector); 726 | mbdev->dirty_bits = mb->dirtiness.is_dirty ? mb->dirtiness.data_bits : 0; 727 | } 728 | 729 | dest->id = cpu_to_le64(src->id); 730 | dest->length = src->length; 731 | dest->checksum = cpu_to_le32(calc_checksum(rambuffer, src->length)); 732 | } 733 | 734 | /*----------------------------------------------------------------------------*/ 735 | 736 | /* 737 | * Apply @i-th metablock in @src to @seg 738 | */ 739 | static int apply_metablock_device(struct wb_device *wb, struct segment_header *seg, 740 | struct segment_header_device *src, u8 i) 741 | { 742 | struct lookup_key key; 743 | struct ht_head *head; 744 | struct metablock *found = NULL, *mb = seg->mb_array + i; 745 | struct metablock_device *mbdev = src->mbarr + i; 746 | 747 | mb->sector = le64_to_cpu(mbdev->sector); 748 | 749 | mb->dirtiness.data_bits = mbdev->dirty_bits ? mbdev->dirty_bits : 255; 750 | mb->dirtiness.is_dirty = mbdev->dirty_bits ? 
true : false; 751 | 752 | key = (struct lookup_key) { 753 | .sector = mb->sector, 754 | }; 755 | head = ht_get_head(wb, &key); 756 | found = ht_lookup(wb, head, &key); 757 | if (found) { 758 | int err = 0; 759 | u8 i; 760 | struct write_io wio; 761 | void *buf = mempool_alloc(wb->buf_8_pool, GFP_KERNEL); 762 | if (!buf) 763 | return -ENOMEM; 764 | 765 | wio = (struct write_io) { 766 | .data = buf, 767 | .data_bits = 0, 768 | }; 769 | err = prepare_overwrite(wb, mb_to_seg(wb, found), found, &wio, mb->dirtiness.data_bits); 770 | if (err) 771 | goto fail_out; 772 | 773 | for (i = 0; i < 8; i++) { 774 | struct dm_io_request io_req; 775 | struct dm_io_region region; 776 | if (!(wio.data_bits & (1 << i))) 777 | continue; 778 | 779 | io_req = (struct dm_io_request) { 780 | WB_IO_WRITE, 781 | .client = wb->io_client, 782 | .notify.fn = NULL, 783 | .mem.type = DM_IO_KMEM, 784 | .mem.ptr.addr = wio.data + (i << 9), 785 | }; 786 | region = (struct dm_io_region) { 787 | .bdev = wb->backing_dev->bdev, 788 | .sector = mb->sector + i, 789 | .count = 1, 790 | }; 791 | err = wb_io(&io_req, 1, ®ion, NULL, true); 792 | if (err) 793 | break; 794 | } 795 | 796 | fail_out: 797 | mempool_free(buf, wb->buf_8_pool); 798 | if (err) 799 | return err; 800 | } 801 | 802 | ht_register(wb, head, mb, &key); 803 | 804 | if (mb->dirtiness.is_dirty) 805 | inc_nr_dirty_caches(wb); 806 | 807 | return 0; 808 | } 809 | 810 | static int apply_segment_header_device(struct wb_device *wb, struct segment_header *seg, 811 | struct segment_header_device *src) 812 | { 813 | int err = 0; 814 | u8 i; 815 | seg->length = src->length; 816 | for (i = 0; i < src->length; i++) { 817 | err = apply_metablock_device(wb, seg, src, i); 818 | if (err) 819 | break; 820 | } 821 | return err; 822 | } 823 | 824 | /* 825 | * Read out only segment header (4KB) of @seg to @buf 826 | */ 827 | static int read_segment_header(void *buf, struct wb_device *wb, 828 | struct segment_header *seg) 829 | { 830 | struct dm_io_request io_req = { 831 | WB_IO_READ, 832 | .client = wb->io_client, 833 | .notify.fn = NULL, 834 | .mem.type = DM_IO_KMEM, 835 | .mem.ptr.addr = buf, 836 | }; 837 | struct dm_io_region region = { 838 | .bdev = wb->cache_dev->bdev, 839 | .sector = seg->start_sector, 840 | .count = 8, 841 | }; 842 | return wb_io(&io_req, 1, ®ion, NULL, false); 843 | } 844 | 845 | /* 846 | * Find the max id from all the segment headers 847 | * @max_id (out) : The max id found 848 | */ 849 | static int do_find_max_id(struct wb_device *wb, u64 *max_id) 850 | { 851 | int err = 0; 852 | u32 k; 853 | 854 | void *buf = mempool_alloc(wb->buf_8_pool, GFP_KERNEL); 855 | if (!buf) 856 | return -ENOMEM; 857 | check_buffer_alignment(buf); 858 | 859 | *max_id = 0; 860 | for (k = 0; k < wb->nr_segments; k++) { 861 | struct segment_header *seg = segment_at(wb, k); 862 | struct segment_header_device *header; 863 | err = read_segment_header(buf, wb, seg); 864 | if (err) 865 | goto out; 866 | 867 | header = buf; 868 | if (le64_to_cpu(header->id) > *max_id) 869 | *max_id = le64_to_cpu(header->id); 870 | } 871 | out: 872 | mempool_free(buf, wb->buf_8_pool); 873 | return err; 874 | } 875 | 876 | static int find_max_id(struct wb_device *wb, u64 *max_id) 877 | { 878 | /* 879 | * Fast path. 880 | * If it's the first creation, we don't need to look over 881 | * the segment headers to know that the max_id is zero. 
882 | */ 883 | if (wb->do_format) { 884 | *max_id = 0; 885 | return 0; 886 | } 887 | 888 | return do_find_max_id(wb, max_id); 889 | } 890 | 891 | /* 892 | * Iterate over the logs on the cache device and apply (recover the cache metadata) 893 | * valid (checksum is correct) segments. 894 | * A segment is valid means that the segment was written without any failure 895 | * typically due to unexpected power failure. 896 | * 897 | * @max_id (in/out) 898 | * - in : The max id found in find_max_id() 899 | * - out : The last id applied in this function 900 | */ 901 | static int do_apply_valid_segments(struct wb_device *wb, u64 *max_id) 902 | { 903 | int err = 0; 904 | struct segment_header *seg; 905 | struct segment_header_device *header; 906 | u32 i, start_idx; 907 | 908 | void *rambuf = vmalloc(1 << (SEGMENT_SIZE_ORDER + 9)); 909 | if (!rambuf) 910 | return -ENOMEM; 911 | 912 | /* 913 | * We are starting from the segment next to the newest one, which can 914 | * be the oldest. The id can be zero if the logs didn't lap at all. 915 | */ 916 | start_idx = segment_id_to_idx(wb, *max_id + 1); 917 | *max_id = 0; 918 | 919 | for (i = start_idx; i < (start_idx + wb->nr_segments); i++) { 920 | u32 actual, expected, k; 921 | div_u64_rem(i, wb->nr_segments, &k); 922 | seg = segment_at(wb, k); 923 | 924 | err = read_whole_segment(rambuf, wb, seg); 925 | if (err) 926 | break; 927 | 928 | header = rambuf; 929 | 930 | /* 931 | * We can't break here. 932 | * Consider sequence of id [1,2,3,0,0,0] 933 | * The max_id is 3 and we start from the 4th segment. 934 | * If we break, the valid logs (1,2,3) are ignored. 935 | */ 936 | if (!le64_to_cpu(header->id)) 937 | continue; 938 | 939 | /* 940 | * Compare the checksum 941 | * if they don't match we discard the subsequent logs. 942 | */ 943 | actual = calc_checksum(rambuf, header->length); 944 | expected = le32_to_cpu(header->checksum); 945 | if (actual != expected) { 946 | DMWARN("Checksum incorrect id:%llu checksum: %u != %u", 947 | (long long unsigned int) le64_to_cpu(header->id), 948 | actual, expected); 949 | break; 950 | } 951 | 952 | /* This segment is correct and we apply */ 953 | err = apply_segment_header_device(wb, seg, header); 954 | if (err) 955 | break; 956 | 957 | *max_id = le64_to_cpu(header->id); 958 | } 959 | 960 | vfree(rambuf); 961 | return err; 962 | } 963 | 964 | static int apply_valid_segments(struct wb_device *wb, u64 *max_id) 965 | { 966 | /* 967 | * Fast path. 968 | * If the max_id is zero, there is obviously no valid segments. 969 | * For the fast initialization, we quit here immediately. 970 | */ 971 | if (!(*max_id)) 972 | return 0; 973 | 974 | return do_apply_valid_segments(wb, max_id); 975 | } 976 | 977 | static int infer_last_writeback_id(struct wb_device *wb) 978 | { 979 | int err = 0; 980 | 981 | u64 inferred_last_writeback_id; 982 | u64 record_id; 983 | 984 | struct superblock_record_device record; 985 | err = read_superblock_record(&record, wb); 986 | if (err) 987 | return err; 988 | 989 | inferred_last_writeback_id = 990 | SUB_ID(atomic64_read(&wb->last_flushed_segment_id), wb->nr_segments); 991 | 992 | /* 993 | * If last_writeback_id is recorded on the super block 994 | * we can eliminate unnecessary writeback for the segments that were 995 | * written back before. 
996 | */ 997 | record_id = le64_to_cpu(record.last_writeback_segment_id); 998 | if (record_id > inferred_last_writeback_id) { 999 | u64 id; 1000 | for (id = inferred_last_writeback_id + 1; id <= record_id; id++) 1001 | mark_clean_seg(wb, get_segment_header_by_id(wb, id)); 1002 | inferred_last_writeback_id = record_id; 1003 | } 1004 | 1005 | atomic64_set(&wb->last_writeback_segment_id, inferred_last_writeback_id); 1006 | return err; 1007 | } 1008 | 1009 | /* 1010 | * Replay all the logs on the cache device to reconstruct the in-memory metadata. 1011 | * 1012 | * Algorithm: 1013 | * 1. Find the maximum id 1014 | * 2. Start from the segment next to the newest one and iterate over all the logs 1015 | * 3. Skip a segment if its id is 0 or its checksum is incorrect 1016 | * 4. Apply it otherwise 1017 | * 1018 | * This algorithm is robust against a flaky SSD that may write a segment partially 1019 | * or lose data in its buffer on a power fault. 1020 | */ 1021 | static int replay_log_on_cache(struct wb_device *wb) 1022 | { 1023 | int err = 0; 1024 | 1025 | u64 max_id; 1026 | err = find_max_id(wb, &max_id); 1027 | if (err) { 1028 | DMERR("find_max_id failed"); 1029 | return err; 1030 | } 1031 | 1032 | err = apply_valid_segments(wb, &max_id); 1033 | if (err) { 1034 | DMERR("apply_valid_segments failed"); 1035 | return err; 1036 | } 1037 | 1038 | /* Setup last_flushed_segment_id */ 1039 | atomic64_set(&wb->last_flushed_segment_id, max_id); 1040 | 1041 | /* Setup last_queued_segment_id */ 1042 | atomic64_set(&wb->last_queued_segment_id, max_id); 1043 | 1044 | /* Setup last_writeback_segment_id */ 1045 | infer_last_writeback_id(wb); 1046 | 1047 | return err; 1048 | } 1049 | 1050 | /* 1051 | * Acquire and initialize the first segment header for our caching. 1052 | */ 1053 | static void prepare_first_seg(struct wb_device *wb) 1054 | { 1055 | u64 init_segment_id = atomic64_read(&wb->last_flushed_segment_id) + 1; 1056 | acquire_new_seg(wb, init_segment_id); 1057 | cursor_init(wb); 1058 | } 1059 | 1060 | /* 1061 | * Recover all the cache state from the persistent devices 1062 | */ 1063 | static int recover_cache(struct wb_device *wb) 1064 | { 1065 | int err = 0; 1066 | 1067 | err = replay_log_on_cache(wb); 1068 | if (err) { 1069 | DMERR("replay_log_on_cache failed"); 1070 | return err; 1071 | } 1072 | 1073 | prepare_first_seg(wb); 1074 | return 0; 1075 | } 1076 | 1077 | /*----------------------------------------------------------------------------*/ 1078 | 1079 | static struct writeback_segment *alloc_writeback_segment(struct wb_device *wb, gfp_t gfp) 1080 | { 1081 | u8 i; 1082 | 1083 | struct writeback_segment *writeback_seg = kmalloc(sizeof(*writeback_seg), gfp); 1084 | if (!writeback_seg) 1085 | goto bad_writeback_seg; 1086 | 1087 | writeback_seg->ios = kmalloc(wb->nr_caches_inseg * sizeof(struct writeback_io), gfp); 1088 | if (!writeback_seg->ios) 1089 | goto bad_ios; 1090 | 1091 | writeback_seg->buf = vmalloc((1 << (SEGMENT_SIZE_ORDER + 9)) - (1 << 12)); 1092 | if (!writeback_seg->buf) 1093 | goto bad_buf; 1094 | 1095 | for (i = 0; i < wb->nr_caches_inseg; i++) { 1096 | struct writeback_io *writeback_io = writeback_seg->ios + i; 1097 | writeback_io->data = writeback_seg->buf + (i << 12); 1098 | } 1099 | 1100 | return writeback_seg; 1101 | 1102 | bad_buf: 1103 | kfree(writeback_seg->ios); 1104 | bad_ios: 1105 | kfree(writeback_seg); 1106 | bad_writeback_seg: 1107 | return NULL; 1108 | } 1109 | 1110 | static void free_writeback_segment(struct wb_device *wb, struct writeback_segment *writeback_seg) 1111 | { 1112 | vfree(writeback_seg->buf); 1113 | 
kfree(writeback_seg->ios); 1114 | kfree(writeback_seg); 1115 | } 1116 | 1117 | /* 1118 | * Try to allocate new writeback buffer by the @nr_batch size. 1119 | * On success, it frees the old buffer. 1120 | * 1121 | * Bad user may set # of batches that can hardly allocate. 1122 | * This function is even robust in such case. 1123 | */ 1124 | static void free_writeback_ios(struct wb_device *wb) 1125 | { 1126 | size_t i; 1127 | for (i = 0; i < wb->nr_cur_batched_writeback; i++) 1128 | free_writeback_segment(wb, *(wb->writeback_segs + i)); 1129 | kfree(wb->writeback_segs); 1130 | } 1131 | 1132 | /* 1133 | * Request to allocate data structures to write back @nr_batch segments. 1134 | * Previous structures are preserved in case of failure. 1135 | */ 1136 | int try_alloc_writeback_ios(struct wb_device *wb, size_t nr_batch, gfp_t gfp) 1137 | { 1138 | int err = 0; 1139 | size_t i; 1140 | 1141 | struct writeback_segment **writeback_segs = kzalloc( 1142 | nr_batch * sizeof(struct writeback_segment *), gfp); 1143 | if (!writeback_segs) 1144 | return -ENOMEM; 1145 | 1146 | for (i = 0; i < nr_batch; i++) { 1147 | struct writeback_segment *alloced = alloc_writeback_segment(wb, gfp); 1148 | if (!alloced) { 1149 | size_t j; 1150 | for (j = 0; j < i; j++) 1151 | free_writeback_segment(wb, writeback_segs[j]); 1152 | kfree(writeback_segs); 1153 | 1154 | DMERR("Failed to allocate writeback_segs"); 1155 | return -ENOMEM; 1156 | } 1157 | writeback_segs[i] = alloced; 1158 | } 1159 | 1160 | /* 1161 | * Free old buffers if exists. 1162 | * wb->writeback_segs is firstly NULL under constructor .ctr. 1163 | */ 1164 | if (wb->writeback_segs) 1165 | free_writeback_ios(wb); 1166 | 1167 | /* And then swap by new values */ 1168 | wb->writeback_segs = writeback_segs; 1169 | wb->nr_writeback_segs = nr_batch; 1170 | 1171 | return err; 1172 | } 1173 | 1174 | /*----------------------------------------------------------------------------*/ 1175 | 1176 | #define CREATE_DAEMON(name) \ 1177 | do { \ 1178 | wb->name = kthread_create( \ 1179 | name##_proc, wb, "dmwb_" #name); \ 1180 | if (IS_ERR(wb->name)) { \ 1181 | err = PTR_ERR(wb->name); \ 1182 | wb->name = NULL; \ 1183 | DMERR("couldn't spawn " #name); \ 1184 | goto bad_##name; \ 1185 | } \ 1186 | wake_up_process(wb->name); \ 1187 | } while (0) 1188 | 1189 | /* 1190 | * Alloc and then setup the initial state of the metadata 1191 | * 1192 | * Metadata: 1193 | * - Segment header array 1194 | * - Metablocks 1195 | * - Hash table 1196 | */ 1197 | static int init_metadata(struct wb_device *wb) 1198 | { 1199 | int err = 0; 1200 | 1201 | err = init_segment_header_array(wb); 1202 | if (err) { 1203 | DMERR("init_segment_header_array failed"); 1204 | goto bad_alloc_segment_header_array; 1205 | } 1206 | 1207 | err = ht_empty_init(wb); 1208 | if (err) { 1209 | DMERR("ht_empty_init failed"); 1210 | goto bad_alloc_ht; 1211 | } 1212 | 1213 | return err; 1214 | 1215 | bad_alloc_ht: 1216 | free_segment_header_array(wb); 1217 | bad_alloc_segment_header_array: 1218 | return err; 1219 | } 1220 | 1221 | static void free_metadata(struct wb_device *wb) 1222 | { 1223 | free_ht(wb); 1224 | free_segment_header_array(wb); 1225 | } 1226 | 1227 | static int init_writeback_daemon(struct wb_device *wb) 1228 | { 1229 | int err = 0; 1230 | size_t nr_batch; 1231 | 1232 | atomic_set(&wb->writeback_fail_count, 0); 1233 | atomic_set(&wb->writeback_io_count, 0); 1234 | 1235 | nr_batch = 32; 1236 | wb->nr_max_batched_writeback = nr_batch; 1237 | if (try_alloc_writeback_ios(wb, nr_batch, GFP_KERNEL)) 1238 | return 
-ENOMEM; 1239 | 1240 | init_waitqueue_head(&wb->writeback_wait_queue); 1241 | init_waitqueue_head(&wb->wait_drop_caches); 1242 | init_waitqueue_head(&wb->writeback_io_wait_queue); 1243 | 1244 | wb->allow_writeback = false; 1245 | wb->urge_writeback = false; 1246 | wb->force_drop = false; 1247 | CREATE_DAEMON(writeback_daemon); 1248 | 1249 | return err; 1250 | 1251 | bad_writeback_daemon: 1252 | free_writeback_ios(wb); 1253 | return err; 1254 | } 1255 | 1256 | static int init_flush_daemon(struct wb_device *wb) 1257 | { 1258 | int err = 0; 1259 | init_waitqueue_head(&wb->flush_wait_queue); 1260 | CREATE_DAEMON(flush_daemon); 1261 | return err; 1262 | 1263 | bad_flush_daemon: 1264 | return err; 1265 | } 1266 | 1267 | static int init_flush_barrier_work(struct wb_device *wb) 1268 | { 1269 | wb->barrier_wq = create_singlethread_workqueue("dmwb_barrier"); 1270 | if (!wb->barrier_wq) { 1271 | DMERR("Failed to allocate barrier_wq"); 1272 | return -ENOMEM; 1273 | } 1274 | bio_list_init(&wb->barrier_ios); 1275 | INIT_WORK(&wb->flush_barrier_work, flush_barrier_ios); 1276 | return 0; 1277 | } 1278 | 1279 | static int init_writeback_modulator(struct wb_device *wb) 1280 | { 1281 | int err = 0; 1282 | wb->writeback_threshold = 0; 1283 | CREATE_DAEMON(writeback_modulator); 1284 | return err; 1285 | 1286 | bad_writeback_modulator: 1287 | return err; 1288 | } 1289 | 1290 | static int init_sb_record_updater(struct wb_device *wb) 1291 | { 1292 | int err = 0; 1293 | wb->update_sb_record_interval = 0; 1294 | CREATE_DAEMON(sb_record_updater); 1295 | return err; 1296 | 1297 | bad_sb_record_updater: 1298 | return err; 1299 | } 1300 | 1301 | static int init_data_synchronizer(struct wb_device *wb) 1302 | { 1303 | int err = 0; 1304 | wb->sync_data_interval = 0; 1305 | CREATE_DAEMON(data_synchronizer); 1306 | return err; 1307 | 1308 | bad_data_synchronizer: 1309 | return err; 1310 | } 1311 | 1312 | int resume_cache(struct wb_device *wb) 1313 | { 1314 | int err = 0; 1315 | 1316 | wb->nr_segments = calc_nr_segments(wb->cache_dev, wb); 1317 | wb->nr_caches_inseg = (1 << (SEGMENT_SIZE_ORDER - 3)) - 1; 1318 | wb->nr_caches = wb->nr_segments * wb->nr_caches_inseg; 1319 | 1320 | err = init_devices(wb); 1321 | if (err) 1322 | goto bad_devices; 1323 | 1324 | err = init_metadata(wb); 1325 | if (err) 1326 | goto bad_metadata; 1327 | 1328 | err = init_writeback_daemon(wb); 1329 | if (err) { 1330 | DMERR("init_writeback_daemon failed"); 1331 | goto bad_writeback_daemon; 1332 | } 1333 | 1334 | err = recover_cache(wb); 1335 | if (err) { 1336 | DMERR("recover_cache failed"); 1337 | goto bad_recover; 1338 | } 1339 | 1340 | err = init_flush_daemon(wb); 1341 | if (err) { 1342 | DMERR("init_flush_daemon failed"); 1343 | goto bad_flush_daemon; 1344 | } 1345 | 1346 | err = init_flush_barrier_work(wb); 1347 | if (err) { 1348 | DMERR("init_flush_barrier_work failed"); 1349 | goto bad_flush_barrier_work; 1350 | } 1351 | 1352 | err = init_writeback_modulator(wb); 1353 | if (err) { 1354 | DMERR("init_writeback_modulator failed"); 1355 | goto bad_modulator; 1356 | } 1357 | 1358 | err = init_sb_record_updater(wb); 1359 | if (err) { 1360 | DMERR("init_sb_recorder failed"); 1361 | goto bad_updater; 1362 | } 1363 | 1364 | err = init_data_synchronizer(wb); 1365 | if (err) { 1366 | DMERR("init_data_synchronizer failed"); 1367 | goto bad_synchronizer; 1368 | } 1369 | 1370 | return err; 1371 | 1372 | bad_synchronizer: 1373 | kthread_stop(wb->sb_record_updater); 1374 | bad_updater: 1375 | kthread_stop(wb->writeback_modulator); 1376 | bad_modulator: 1377 
| destroy_workqueue(wb->barrier_wq); 1378 | bad_flush_barrier_work: 1379 | kthread_stop(wb->flush_daemon); 1380 | bad_flush_daemon: 1381 | bad_recover: 1382 | kthread_stop(wb->writeback_daemon); 1383 | free_writeback_ios(wb); 1384 | bad_writeback_daemon: 1385 | free_metadata(wb); 1386 | bad_metadata: 1387 | free_devices(wb); 1388 | bad_devices: 1389 | return err; 1390 | } 1391 | 1392 | void free_cache(struct wb_device *wb) 1393 | { 1394 | /* 1395 | * kthread_stop() wakes up the thread. 1396 | * So we don't need to wake them up by ourselves. 1397 | */ 1398 | kthread_stop(wb->data_synchronizer); 1399 | kthread_stop(wb->sb_record_updater); 1400 | kthread_stop(wb->writeback_modulator); 1401 | 1402 | destroy_workqueue(wb->barrier_wq); 1403 | 1404 | kthread_stop(wb->flush_daemon); 1405 | 1406 | kthread_stop(wb->writeback_daemon); 1407 | free_writeback_ios(wb); 1408 | 1409 | free_metadata(wb); 1410 | 1411 | free_devices(wb); 1412 | } 1413 | -------------------------------------------------------------------------------- /src/dm-writeboost-target.c: -------------------------------------------------------------------------------- 1 | /* 2 | * dm-writeboost 3 | * Log-structured Caching for Linux 4 | * 5 | * This file is part of dm-writeboost 6 | * Copyright (C) 2012-2025 Akira Hayakawa 7 | * 8 | * This program is free software; you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation; either version 2 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License along 19 | * with this program; if not, write to the Free Software Foundation, Inc., 20 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 21 | */ 22 | 23 | #include "dm-writeboost.h" 24 | #include "dm-writeboost-metadata.h" 25 | #include "dm-writeboost-daemon.h" 26 | 27 | #include "linux/sort.h" 28 | 29 | #if (LINUX_VERSION_CODE >= KERNEL_VERSION(6,8,2)) || \ 30 | ((LINUX_VERSION_CODE >= KERNEL_VERSION(6,7,11)) && (LINUX_VERSION_CODE < KERNEL_VERSION(6,8,0))) || \ 31 | ((LINUX_VERSION_CODE >= KERNEL_VERSION(6,6,23)) && (LINUX_VERSION_CODE < KERNEL_VERSION(6,7,0))) || \ 32 | ((LINUX_VERSION_CODE >= KERNEL_VERSION(6,1,83)) && (LINUX_VERSION_CODE < KERNEL_VERSION(6,2,0))) 33 | // Linux commit 6e5f0f6383b4896c7e9b943d84b136149d0f45e9 "dm io: Support IO priority" 34 | // added the IO priority parameter in v6.9-rc1. 35 | #define DM_IO(arg1, arg2, arg3, arg4) dm_io(arg1, arg2, arg3, arg4, IOPRIO_DEFAULT) 36 | #else 37 | #define DM_IO(arg1, arg2, arg3, arg4) dm_io(arg1, arg2, arg3, arg4) 38 | #endif 39 | 40 | /*----------------------------------------------------------------------------*/ 41 | 42 | void do_check_buffer_alignment(void *buf, const char *name, const char *caller) 43 | { 44 | unsigned long addr = (unsigned long) buf; 45 | 46 | if (!IS_ALIGNED(addr, 1 << 9)) { 47 | DMCRIT("@%s in %s is not sector-aligned. 
I/O buffer must be sector-aligned.", name, caller); 48 | BUG(); 49 | } 50 | } 51 | 52 | /*----------------------------------------------------------------------------*/ 53 | 54 | struct wb_io { 55 | struct work_struct work; 56 | int err; 57 | unsigned long err_bits; 58 | struct dm_io_request *io_req; 59 | unsigned num_regions; 60 | struct dm_io_region *regions; 61 | }; 62 | 63 | static void wb_io_fn(struct work_struct *work) 64 | { 65 | struct wb_io *io = container_of(work, struct wb_io, work); 66 | io->err_bits = 0; 67 | io->err = DM_IO(io->io_req, io->num_regions, io->regions, &io->err_bits); 68 | } 69 | 70 | int wb_io_internal(struct wb_device *wb, struct dm_io_request *io_req, 71 | unsigned num_regions, struct dm_io_region *regions, 72 | unsigned long *err_bits, bool thread, const char *caller) 73 | { 74 | int err = 0; 75 | 76 | if (thread) { 77 | struct wb_io io = { 78 | .io_req = io_req, 79 | .regions = regions, 80 | .num_regions = num_regions, 81 | }; 82 | ASSERT(io_req->notify.fn == NULL); 83 | 84 | INIT_WORK_ONSTACK(&io.work, wb_io_fn); 85 | queue_work(wb->io_wq, &io.work); 86 | flush_workqueue(wb->io_wq); 87 | destroy_work_on_stack(&io.work); /* Pair with INIT_WORK_ONSTACK */ 88 | 89 | err = io.err; 90 | if (err_bits) 91 | *err_bits = io.err_bits; 92 | } else { 93 | err = DM_IO(io_req, num_regions, regions, err_bits); 94 | } 95 | 96 | /* err_bits can be NULL. */ 97 | if (err || (err_bits && *err_bits)) { 98 | char buf[BDEVNAME_SIZE]; 99 | dev_t dev = regions->bdev->bd_dev; 100 | 101 | unsigned long eb; 102 | if (!err_bits) 103 | eb = (~(unsigned long)0); 104 | else 105 | eb = *err_bits; 106 | 107 | format_dev_t(buf, dev); 108 | DMERR("%s() I/O error(%d), bits(%lu), dev(%s), sector(%llu), %s", 109 | caller, err, eb, 110 | buf, (unsigned long long) regions->sector, 111 | req_is_write(io_req) ? 
"write" : "read"); 112 | } 113 | 114 | return err; 115 | } 116 | 117 | sector_t dm_devsize(struct dm_dev *dev) 118 | { 119 | #if LINUX_VERSION_CODE >= KERNEL_VERSION(5,11,0) 120 | return bdev_nr_sectors(dev->bdev); 121 | #else 122 | return i_size_read(dev->bdev->bd_inode) >> 9; 123 | #endif 124 | } 125 | 126 | /*----------------------------------------------------------------------------*/ 127 | 128 | void bio_io_success_compat(struct bio *bio) 129 | { 130 | #if LINUX_VERSION_CODE >= KERNEL_VERSION(4,13,0) 131 | bio->bi_status = BLK_STS_OK; 132 | bio_endio(bio); 133 | #elif LINUX_VERSION_CODE >= KERNEL_VERSION(4,3,0) 134 | bio->bi_error = 0; 135 | bio_endio(bio); 136 | #else 137 | bio_endio(bio, 0); 138 | #endif 139 | } 140 | 141 | #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,14,0) 142 | #define bi_sector(bio) (bio)->bi_iter.bi_sector 143 | #else 144 | #define bi_sector(bio) (bio)->bi_sector 145 | #endif 146 | 147 | static void bio_remap(struct bio *bio, struct dm_dev *dev, sector_t sector) 148 | { 149 | #if LINUX_VERSION_CODE >= KERNEL_VERSION(4,14,0) 150 | bio_set_dev(bio, dev->bdev); 151 | #else 152 | bio->bi_bdev = dev->bdev; 153 | #endif 154 | bi_sector(bio) = sector; 155 | } 156 | 157 | static u8 calc_offset(sector_t sector) 158 | { 159 | u32 tmp32; 160 | div_u64_rem(sector, 1 << 3, &tmp32); 161 | return tmp32; 162 | } 163 | 164 | static u8 bio_calc_offset(struct bio *bio) 165 | { 166 | return calc_offset(bi_sector(bio)); 167 | } 168 | 169 | static bool bio_is_fullsize(struct bio *bio) 170 | { 171 | return bio_sectors(bio) == (1 << 3); 172 | } 173 | 174 | static bool bio_is_write(struct bio *bio) 175 | { 176 | return bio_data_dir(bio) == WRITE; 177 | } 178 | 179 | /* 180 | * We use 4KB alignment address of original request the as the lookup key. 181 | */ 182 | static sector_t calc_cache_alignment(sector_t bio_sector) 183 | { 184 | return div_u64(bio_sector, 1 << 3) * (1 << 3); 185 | } 186 | 187 | /*----------------------------------------------------------------------------*/ 188 | 189 | /* 190 | * Wake up the processes on the wq if the wq is active. 191 | * (At least a process is waiting on it) 192 | * This function should only used for wq that is rarely active. 193 | * Otherwise ordinary wake_up() should be used instead. 
194 | */ 195 | static void wake_up_active_wq(wait_queue_head_t *wq) 196 | { 197 | if (unlikely(waitqueue_active(wq))) 198 | wake_up(wq); 199 | } 200 | 201 | /*----------------------------------------------------------------------------*/ 202 | 203 | static u8 count_dirty_caches_remained(struct segment_header *seg) 204 | { 205 | u8 i, count = 0; 206 | struct metablock *mb; 207 | for (i = 0; i < seg->length; i++) { 208 | mb = seg->mb_array + i; 209 | if (mb->dirtiness.is_dirty) 210 | count++; 211 | } 212 | return count; 213 | } 214 | 215 | void inc_nr_dirty_caches(struct wb_device *wb) 216 | { 217 | ASSERT(wb); 218 | atomic64_inc(&wb->nr_dirty_caches); 219 | } 220 | 221 | void dec_nr_dirty_caches(struct wb_device *wb) 222 | { 223 | ASSERT(wb); 224 | if (atomic64_dec_and_test(&wb->nr_dirty_caches)) 225 | wake_up_interruptible(&wb->wait_drop_caches); 226 | } 227 | 228 | static bool taint_mb(struct wb_device *wb, struct metablock *mb, u8 data_bits) 229 | { 230 | unsigned long flags; 231 | bool flipped = false; 232 | 233 | ASSERT(data_bits > 0); 234 | spin_lock_irqsave(&wb->mb_lock, flags); 235 | if (!mb->dirtiness.is_dirty) { 236 | mb->dirtiness.is_dirty = true; 237 | flipped = true; 238 | } 239 | mb->dirtiness.data_bits |= data_bits; 240 | spin_unlock_irqrestore(&wb->mb_lock, flags); 241 | 242 | return flipped; 243 | } 244 | 245 | bool mark_clean_mb(struct wb_device *wb, struct metablock *mb) 246 | { 247 | unsigned long flags; 248 | bool flipped = false; 249 | 250 | spin_lock_irqsave(&wb->mb_lock, flags); 251 | if (mb->dirtiness.is_dirty) { 252 | mb->dirtiness.is_dirty = false; 253 | flipped = true; 254 | } 255 | spin_unlock_irqrestore(&wb->mb_lock, flags); 256 | 257 | return flipped; 258 | } 259 | 260 | /* 261 | * Read the dirtiness of a metablock at the moment. 262 | */ 263 | struct dirtiness read_mb_dirtiness(struct wb_device *wb, struct segment_header *seg, 264 | struct metablock *mb) 265 | { 266 | unsigned long flags; 267 | struct dirtiness retval; 268 | 269 | spin_lock_irqsave(&wb->mb_lock, flags); 270 | retval = mb->dirtiness; 271 | spin_unlock_irqrestore(&wb->mb_lock, flags); 272 | 273 | return retval; 274 | } 275 | 276 | /*----------------------------------------------------------------------------*/ 277 | 278 | void cursor_init(struct wb_device *wb) 279 | { 280 | wb->cursor = wb->current_seg->start_idx; 281 | wb->current_seg->length = 0; 282 | } 283 | 284 | /* 285 | * Advance the cursor and return the old cursor. 286 | * After returned, nr_inflight_ios is incremented to wait for this write to complete. 
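 *
 * Illustrative example (the numbers are assumed, only for the sketch):
 * with wb->nr_caches = 4 and wb->cursor = 3, successive calls return
 * 3, 0, 1, ... because the cursor wraps to 0 once it reaches wb->nr_caches.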
287 | */ 288 | static u32 advance_cursor(struct wb_device *wb) 289 | { 290 | u32 old; 291 | if (wb->cursor == wb->nr_caches) 292 | wb->cursor = 0; 293 | old = wb->cursor; 294 | wb->cursor++; 295 | wb->current_seg->length++; 296 | BUG_ON(wb->current_seg->length > wb->nr_caches_inseg); 297 | atomic_inc(&wb->current_seg->nr_inflight_ios); 298 | return old; 299 | } 300 | 301 | static bool needs_queue_seg(struct wb_device *wb) 302 | { 303 | bool rambuf_no_space = !mb_idx_inseg(wb, wb->cursor); 304 | return rambuf_no_space; 305 | } 306 | 307 | /*----------------------------------------------------------------------------*/ 308 | 309 | static void copy_barrier_requests(struct rambuffer *rambuf, struct wb_device *wb) 310 | { 311 | bio_list_init(&rambuf->barrier_ios); 312 | bio_list_merge(&rambuf->barrier_ios, &wb->barrier_ios); 313 | bio_list_init(&wb->barrier_ios); 314 | } 315 | 316 | static void prepare_rambuffer(struct rambuffer *rambuf, 317 | struct wb_device *wb, 318 | struct segment_header *seg) 319 | { 320 | rambuf->seg = seg; 321 | prepare_segment_header_device(rambuf->data, wb, seg); 322 | copy_barrier_requests(rambuf, wb); 323 | } 324 | 325 | static void init_rambuffer(struct wb_device *wb) 326 | { 327 | memset(wb->current_rambuf->data, 0, 1 << 12); 328 | } 329 | 330 | /* 331 | * Acquire a new RAM buffer for the new segment. 332 | */ 333 | static void __acquire_new_rambuffer(struct wb_device *wb, u64 id) 334 | { 335 | wait_for_flushing(wb, SUB_ID(id, NR_RAMBUF_POOL)); 336 | 337 | wb->current_rambuf = get_rambuffer_by_id(wb, id); 338 | 339 | init_rambuffer(wb); 340 | } 341 | 342 | static void __acquire_new_seg(struct wb_device *wb, u64 id) 343 | { 344 | struct segment_header *new_seg = get_segment_header_by_id(wb, id); 345 | 346 | /* 347 | * We wait for all requests to the new segment is consumed. 348 | * Mutex taken guarantees that no new I/O to this segment is coming in. 349 | */ 350 | wait_event(wb->inflight_ios_wq, 351 | !atomic_read(&new_seg->nr_inflight_ios)); 352 | 353 | wait_for_writeback(wb, SUB_ID(id, wb->nr_segments)); 354 | if (count_dirty_caches_remained(new_seg)) { 355 | DMERR("%u dirty caches remained. id:%llu", 356 | count_dirty_caches_remained(new_seg), id); 357 | BUG(); 358 | } 359 | discard_caches_inseg(wb, new_seg); 360 | 361 | /* 362 | * We mustn't set new id to the new segment before 363 | * all wait_* events are done since they uses those id for waiting. 364 | */ 365 | new_seg->id = id; 366 | wb->current_seg = new_seg; 367 | } 368 | 369 | /* 370 | * Acquire the new segment and RAM buffer for the following writes. 371 | * Guarantees all dirty caches in the segments are written back and 372 | * all metablocks in it are invalidated (Linked to null head). 
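 *
 * The two helpers called here wait on different ids:
 * __acquire_new_rambuffer() waits until the segment that last used this
 * RAM buffer slot (SUB_ID(id, NR_RAMBUF_POOL)) has been flushed, and
 * __acquire_new_seg() waits for the in-flight I/Os and the writeback of
 * the segment header being recycled (SUB_ID(id, wb->nr_segments)).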
373 | */ 374 | void acquire_new_seg(struct wb_device *wb, u64 id) 375 | { 376 | __acquire_new_rambuffer(wb, id); 377 | __acquire_new_seg(wb, id); 378 | } 379 | 380 | static void prepare_new_seg(struct wb_device *wb) 381 | { 382 | u64 next_id = wb->current_seg->id + 1; 383 | acquire_new_seg(wb, next_id); 384 | cursor_init(wb); 385 | } 386 | 387 | /*----------------------------------------------------------------------------*/ 388 | 389 | static void queue_flush_job(struct wb_device *wb) 390 | { 391 | wait_event(wb->inflight_ios_wq, !atomic_read(&wb->current_seg->nr_inflight_ios)); 392 | 393 | prepare_rambuffer(wb->current_rambuf, wb, wb->current_seg); 394 | 395 | smp_wmb(); 396 | atomic64_inc(&wb->last_queued_segment_id); 397 | wake_up_process(wb->flush_daemon); 398 | } 399 | 400 | static void queue_current_buffer(struct wb_device *wb) 401 | { 402 | queue_flush_job(wb); 403 | prepare_new_seg(wb); 404 | } 405 | 406 | /* 407 | * queue_current_buffer if the RAM buffer can't make space any more. 408 | */ 409 | static void might_queue_current_buffer(struct wb_device *wb) 410 | { 411 | if (needs_queue_seg(wb)) { 412 | update_nr_empty_segs(wb); 413 | queue_current_buffer(wb); 414 | } 415 | } 416 | 417 | /* 418 | * Flush out all the transient data at a moment but _NOT_ persistently. 419 | */ 420 | void flush_current_buffer(struct wb_device *wb) 421 | { 422 | struct segment_header *old_seg; 423 | 424 | mutex_lock(&wb->io_lock); 425 | old_seg = wb->current_seg; 426 | 427 | queue_current_buffer(wb); 428 | mutex_unlock(&wb->io_lock); 429 | 430 | wait_for_flushing(wb, old_seg->id); 431 | } 432 | 433 | /*----------------------------------------------------------------------------*/ 434 | 435 | static void inc_stat(struct wb_device *wb, 436 | int rw, bool found, bool on_buffer, bool fullsize) 437 | { 438 | atomic64_t *v; 439 | 440 | int i = 0; 441 | if (rw) 442 | i |= (1 << WB_STAT_WRITE); 443 | if (found) 444 | i |= (1 << WB_STAT_HIT); 445 | if (on_buffer) 446 | i |= (1 << WB_STAT_ON_BUFFER); 447 | if (fullsize) 448 | i |= (1 << WB_STAT_FULLSIZE); 449 | 450 | v = &wb->stat[i]; 451 | atomic64_inc(v); 452 | } 453 | 454 | static void clear_stat(struct wb_device *wb) 455 | { 456 | size_t i; 457 | for (i = 0; i < STATLEN; i++) { 458 | atomic64_t *v = &wb->stat[i]; 459 | atomic64_set(v, 0); 460 | } 461 | atomic64_set(&wb->count_non_full_flushed, 0); 462 | } 463 | 464 | /*----------------------------------------------------------------------------*/ 465 | 466 | #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,14,0) 467 | #define bv_vec struct bio_vec 468 | #define bv_page(vec) vec.bv_page 469 | #define bv_offset(vec) vec.bv_offset 470 | #define bv_len(vec) vec.bv_len 471 | #define bv_it struct bvec_iter 472 | #else 473 | #define bv_vec struct bio_vec * 474 | #define bv_page(vec) vec->bv_page 475 | #define bv_offset(vec) vec->bv_offset 476 | #define bv_len(vec) vec->bv_len 477 | #define bv_it int 478 | #endif 479 | 480 | /* 481 | * Incoming bio may have multiple bio vecs as a result bvec merging. 482 | * We shouldn't use bio_data directly to access to whole payload but 483 | * should iterate over the vector. 
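 *
 * For example, a 4KB write may arrive as two 2KB vecs backed by different
 * pages; the loop below therefore kmaps each page in turn and asserts that
 * the total copied length equals bio_sectors(bio) << 9.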
484 | */ 485 | static void copy_bio_payload(void *buf, struct bio *bio) 486 | { 487 | size_t sum = 0; 488 | bv_vec vec; 489 | bv_it it; 490 | bio_for_each_segment(vec, bio, it) { 491 | void *dst = kmap_atomic(bv_page(vec)); 492 | size_t l = bv_len(vec); 493 | memcpy(buf, dst + bv_offset(vec), l); 494 | kunmap_atomic(dst); 495 | buf += l; 496 | sum += l; 497 | } 498 | ASSERT(sum == (bio_sectors(bio) << 9)); 499 | } 500 | 501 | /* 502 | * Copy 512B buffer data to bio payload's i-th 512B area. 503 | */ 504 | static void __copy_to_bio_payload(struct bio *bio, void *buf, u8 i) 505 | { 506 | size_t head = 0; 507 | size_t tail = head; 508 | 509 | bv_vec vec; 510 | bv_it it; 511 | bio_for_each_segment(vec, bio, it) { 512 | size_t l = bv_len(vec); 513 | tail += l; 514 | if ((i << 9) < tail) { 515 | void *dst = kmap_atomic(bv_page(vec)); 516 | size_t offset = (i << 9) - head; 517 | BUG_ON((l - offset) < (1 << 9)); 518 | memcpy(dst + bv_offset(vec) + offset, buf, 1 << 9); 519 | kunmap_atomic(dst); 520 | return; 521 | } 522 | head += l; 523 | } 524 | BUG(); 525 | } 526 | 527 | /* 528 | * Copy 4KB buffer to bio payload with care to bio offset and copy bits. 529 | */ 530 | static void copy_to_bio_payload(struct bio *bio, void *buf, u8 copy_bits) 531 | { 532 | u8 offset = bio_calc_offset(bio); 533 | u8 i; 534 | for (i = 0; i < bio_sectors(bio); i++) { 535 | u8 i_offset = i + offset; 536 | if (copy_bits & (1 << i_offset)) 537 | __copy_to_bio_payload(bio, buf + (i_offset << 9), i); 538 | } 539 | } 540 | 541 | /*----------------------------------------------------------------------------*/ 542 | 543 | struct lookup_result { 544 | struct ht_head *head; /* Lookup head used */ 545 | struct lookup_key key; /* Lookup key used */ 546 | 547 | struct segment_header *found_seg; 548 | struct metablock *found_mb; 549 | 550 | bool found; /* Cache hit? */ 551 | bool on_buffer; /* Is the metablock found on the RAM buffer? */ 552 | }; 553 | 554 | /* 555 | * Lookup a bio relevant cache data. 556 | * In case of cache hit, nr_inflight_ios is incremented. 
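 *
 * The caller is expected to drop that reference with dec_inflight_ios()
 * on res->found_seg once it is done with the hit, as the read and write
 * paths below do. A minimal sketch of the pairing:
 *
 *   waiter: wait_event(wb->inflight_ios_wq, !atomic_read(&seg->nr_inflight_ios));
 *   waker:  if (atomic_dec_and_test(&seg->nr_inflight_ios))
 *                   wake_up_active_wq(&wb->inflight_ios_wq);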
557 | */ 558 | static void cache_lookup(struct wb_device *wb, struct bio *bio, struct lookup_result *res) 559 | { 560 | res->key = (struct lookup_key) { 561 | .sector = calc_cache_alignment(bi_sector(bio)), 562 | }; 563 | res->head = ht_get_head(wb, &res->key); 564 | 565 | res->found_mb = ht_lookup(wb, res->head, &res->key); 566 | if (res->found_mb) { 567 | res->found_seg = mb_to_seg(wb, res->found_mb); 568 | atomic_inc(&res->found_seg->nr_inflight_ios); 569 | } 570 | 571 | res->found = (res->found_mb != NULL); 572 | 573 | res->on_buffer = false; 574 | if (res->found) 575 | res->on_buffer = is_on_buffer(wb, res->found_mb->idx); 576 | 577 | inc_stat(wb, bio_is_write(bio), res->found, res->on_buffer, bio_is_fullsize(bio)); 578 | } 579 | 580 | static void dec_inflight_ios(struct wb_device *wb, struct segment_header *seg) 581 | { 582 | if (atomic_dec_and_test(&seg->nr_inflight_ios)) 583 | wake_up_active_wq(&wb->inflight_ios_wq); 584 | } 585 | 586 | /*----------------------------------------------------------------------------*/ 587 | 588 | static u8 to_mask(u8 offset, u8 count) 589 | { 590 | u8 i; 591 | u8 result = 0; 592 | if (count == 8) { 593 | result = 255; 594 | } else { 595 | for (i = 0; i < count; i++) 596 | result |= (1 << (i + offset)); 597 | } 598 | return result; 599 | } 600 | 601 | static int fill_payload_by_backing(struct wb_device *wb, struct bio *bio) 602 | { 603 | struct dm_io_request io_req; 604 | struct dm_io_region region; 605 | 606 | sector_t start = bi_sector(bio); 607 | u8 offset = calc_offset(start); 608 | u8 len = bio_sectors(bio); 609 | u8 copy_bits = to_mask(offset, len); 610 | 611 | int err = 0; 612 | void *buf = mempool_alloc(wb->buf_8_pool, GFP_NOIO); 613 | if (!buf) 614 | return -ENOMEM; 615 | 616 | io_req = (struct dm_io_request) { 617 | WB_IO_READ, 618 | .client = wb->io_client, 619 | .notify.fn = NULL, 620 | .mem.type = DM_IO_KMEM, 621 | .mem.ptr.addr = buf + (offset << 9), 622 | }; 623 | region = (struct dm_io_region) { 624 | .bdev = wb->backing_dev->bdev, 625 | .sector = start, 626 | .count = len, 627 | }; 628 | err = wb_io(&io_req, 1, ®ion, NULL, true); 629 | if (err) 630 | goto bad; 631 | 632 | copy_to_bio_payload(bio, buf, copy_bits); 633 | bad: 634 | mempool_free(buf, wb->buf_8_pool); 635 | return err; 636 | } 637 | 638 | /* 639 | * Get the reference to the 4KB-aligned data in RAM buffer. 640 | * Since it only takes the reference caller need not to free the pointer. 641 | */ 642 | static void *ref_buffered_mb(struct wb_device *wb, struct metablock *mb) 643 | { 644 | sector_t offset = ((mb_idx_inseg(wb, mb->idx) + 1) << 3); 645 | return wb->current_rambuf->data + (offset << 9); 646 | } 647 | 648 | /* 649 | * Read cache block of the mb. 650 | * Caller should free the returned pointer after used by mempool_alloc(). 
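 *
 * The buffer comes from wb->buf_8_pool, so release it with
 * mempool_free(buf, wb->buf_8_pool), as prepare_overwrite() and
 * process_read() do.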
651 | */ 652 | static void *read_mb(struct wb_device *wb, struct segment_header *seg, 653 | struct metablock *mb, u8 data_bits) 654 | { 655 | u8 i; 656 | void *result = mempool_alloc(wb->buf_8_pool, GFP_NOIO); 657 | if (!result) 658 | return NULL; 659 | 660 | for (i = 0; i < 8; i++) { 661 | int err = 0; 662 | struct dm_io_request io_req; 663 | struct dm_io_region region; 664 | 665 | if (!(data_bits & (1 << i))) 666 | continue; 667 | 668 | io_req = (struct dm_io_request) { 669 | WB_IO_READ, 670 | .client = wb->io_client, 671 | .notify.fn = NULL, 672 | .mem.type = DM_IO_KMEM, 673 | .mem.ptr.addr = result + (i << 9), 674 | }; 675 | 676 | region = (struct dm_io_region) { 677 | .bdev = wb->cache_dev->bdev, 678 | .sector = calc_mb_start_sector(wb, seg, mb->idx) + i, 679 | .count = 1, 680 | }; 681 | 682 | err = wb_io(&io_req, 1, ®ion, NULL, true); 683 | if (err) { 684 | mempool_free(result, wb->buf_8_pool); 685 | return NULL; 686 | } 687 | } 688 | return result; 689 | } 690 | 691 | /*----------------------------------------------------------------------------*/ 692 | 693 | enum PBD_FLAG { 694 | PBD_NONE = 0, 695 | PBD_WILL_CACHE = 1, 696 | PBD_READ_SEG = 2, 697 | }; 698 | 699 | #if (LINUX_VERSION_CODE >= KERNEL_VERSION(4,6,0) || RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,3)) 700 | #define PER_BIO_DATA_SIZE per_io_data_size 701 | #else 702 | #define PER_BIO_DATA_SIZE per_bio_data_size 703 | #endif 704 | struct per_bio_data { 705 | enum PBD_FLAG type; 706 | union { 707 | u32 cell_idx; 708 | struct segment_header *seg; 709 | }; 710 | }; 711 | #define per_bio_data(bio) ((struct per_bio_data *) dm_per_bio_data((bio), sizeof(struct per_bio_data))) 712 | 713 | /*----------------------------------------------------------------------------*/ 714 | 715 | #define read_cache_cell_from_node(node) rb_entry((node), struct read_cache_cell, rb_node) 716 | 717 | static void read_cache_add(struct read_cache_cells *cells, struct read_cache_cell *cell) 718 | { 719 | struct rb_node **rbp, *parent; 720 | rbp = &cells->rb_root.rb_node; 721 | parent = NULL; 722 | while (*rbp) { 723 | struct read_cache_cell *parent_cell; 724 | parent = *rbp; 725 | parent_cell = read_cache_cell_from_node(parent); 726 | if (cell->sector < parent_cell->sector) 727 | rbp = &(*rbp)->rb_left; 728 | else 729 | rbp = &(*rbp)->rb_right; 730 | } 731 | rb_link_node(&cell->rb_node, parent, rbp); 732 | rb_insert_color(&cell->rb_node, &cells->rb_root); 733 | } 734 | 735 | static struct read_cache_cell *lookup_read_cache_cell(struct read_cache_cells *cells, sector_t sector) 736 | { 737 | struct rb_node **rbp, *parent; 738 | rbp = &cells->rb_root.rb_node; 739 | parent = NULL; 740 | while (*rbp) { 741 | struct read_cache_cell *parent_cell; 742 | parent = *rbp; 743 | parent_cell = read_cache_cell_from_node(parent); 744 | if (parent_cell->sector == sector) 745 | return parent_cell; 746 | 747 | if (sector < parent_cell->sector) 748 | rbp = &(*rbp)->rb_left; 749 | else 750 | rbp = &(*rbp)->rb_right; 751 | } 752 | return NULL; 753 | } 754 | 755 | /* 756 | * Cancel all cells in [cursor, cursor + seqcount). 
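 * Handed-out cells occupy indices [cursor, size) of the array (the cursor
 * counts down from cells->size), hence the clamp below.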
757 | */ 758 | static void read_cache_cancel_seq_cells(struct read_cache_cells *cells) 759 | { 760 | u32 i; 761 | u32 last = cells->cursor + cells->seqcount; 762 | if (last > cells->size) 763 | last = cells->size; 764 | for (i = cells->cursor; i < last; i++) { 765 | struct read_cache_cell *cell = cells->array + i; 766 | atomic_set(&cell->cancelled, 1); 767 | } 768 | } 769 | 770 | /* 771 | * Track the forefront read address and cancel cells in case of over threshold. 772 | * If the cell is cancelled foreground, we can save the memory copy in the background. 773 | */ 774 | static void read_cache_cancel_foreground(struct read_cache_cells *cells, 775 | struct read_cache_cell *new_cell) 776 | { 777 | if (new_cell->sector == (cells->last_sector + 8)) 778 | cells->seqcount++; 779 | else { 780 | cells->seqcount = 1; 781 | cells->over_threshold = false; 782 | } 783 | cells->last_sector = new_cell->sector; 784 | 785 | if (cells->seqcount > cells->threshold) { 786 | if (cells->over_threshold) 787 | atomic_set(&new_cell->cancelled, 1); 788 | else { 789 | cells->over_threshold = true; 790 | read_cache_cancel_seq_cells(cells); 791 | } 792 | } 793 | } 794 | 795 | static bool do_reserve_read_cache_cell(struct read_cache_cells *cells, struct bio *bio) 796 | { 797 | struct read_cache_cell *found, *new_cell; 798 | 799 | ASSERT(cells->threshold > 0); 800 | if (!cells->cursor) 801 | return false; 802 | 803 | /* 804 | * We don't need to reserve the same address twice 805 | * because it's either unchanged or invalidated. 806 | */ 807 | found = lookup_read_cache_cell(cells, bi_sector(bio)); 808 | if (found) 809 | return false; 810 | 811 | cells->cursor--; 812 | new_cell = cells->array + cells->cursor; 813 | new_cell->sector = bi_sector(bio); 814 | read_cache_add(cells, new_cell); 815 | 816 | /* Cancel the new_cell if needed */ 817 | read_cache_cancel_foreground(cells, new_cell); 818 | 819 | return true; 820 | } 821 | 822 | static bool reserve_read_cache_cell(struct wb_device *wb, struct bio *bio) 823 | { 824 | struct per_bio_data *pbd; 825 | struct read_cache_cells *cells = wb->read_cache_cells; 826 | bool reserved; 827 | 828 | if (!read_once(wb->read_cache_threshold)) 829 | return false; 830 | 831 | /* 832 | * We only cache 4KB read data for following reasons: 833 | * 1) Caching partial data (< 4KB) is likely meaningless. 834 | * 2) Caching partial data makes the read-caching mechanism very hard. 
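 * (A full-size bio here means bio_sectors(bio) == 8, i.e. exactly one
 * 4KB block; see bio_is_fullsize().)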
835 | */ 836 | if (!bio_is_fullsize(bio)) 837 | return false; 838 | 839 | mutex_lock(&cells->lock); 840 | reserved = do_reserve_read_cache_cell(cells, bio); 841 | mutex_unlock(&cells->lock); 842 | 843 | if (!reserved) 844 | return false; 845 | 846 | pbd = per_bio_data(bio); 847 | pbd->type = PBD_WILL_CACHE; 848 | pbd->cell_idx = cells->cursor; 849 | 850 | return true; 851 | } 852 | 853 | static void might_cancel_read_cache_cell(struct wb_device *wb, struct bio *bio) 854 | { 855 | struct read_cache_cell *found; 856 | struct read_cache_cells *cells = wb->read_cache_cells; 857 | 858 | mutex_lock(&cells->lock); 859 | found = lookup_read_cache_cell(cells, calc_cache_alignment(bi_sector(bio))); 860 | mutex_unlock(&cells->lock); 861 | 862 | if (found) 863 | atomic_set(&found->cancelled, 1); 864 | } 865 | 866 | static void read_cache_cell_copy_data(struct wb_device *wb, struct bio *bio, unsigned long error) 867 | { 868 | struct per_bio_data *pbd = per_bio_data(bio); 869 | struct read_cache_cells *cells = wb->read_cache_cells; 870 | struct read_cache_cell *cell = cells->array + pbd->cell_idx; 871 | 872 | ASSERT(pbd->type == PBD_WILL_CACHE); 873 | 874 | /* Data can be broken. So don't stage. */ 875 | if (error) 876 | atomic_set(&cell->cancelled, 1); 877 | 878 | /* 879 | * We can omit copying if the cell is cancelled but 880 | * copying for a non-cancelled cell isn't problematic. 881 | */ 882 | if (!atomic_read(&cell->cancelled)) 883 | copy_bio_payload(cell->data, bio); 884 | 885 | if (atomic_dec_and_test(&cells->ack_count)) 886 | queue_work(cells->wq, &wb->read_cache_work); 887 | } 888 | 889 | /* 890 | * Get a read cache cell through simplified write path if the cell data isn't stale. 891 | */ 892 | static void inject_read_cache(struct wb_device *wb, struct read_cache_cell *cell) 893 | { 894 | struct metablock *mb; 895 | u32 _mb_idx_inseg; 896 | struct segment_header *seg; 897 | 898 | struct lookup_key key = { 899 | .sector = cell->sector, 900 | }; 901 | struct ht_head *head = ht_get_head(wb, &key); 902 | 903 | mutex_lock(&wb->io_lock); 904 | /* 905 | * if might_cancel_read_cache_cell() on the foreground 906 | * cancelled this cell, the data is now stale. 907 | */ 908 | if (atomic_read(&cell->cancelled)) { 909 | mutex_unlock(&wb->io_lock); 910 | return; 911 | } 912 | 913 | might_queue_current_buffer(wb); 914 | 915 | seg = wb->current_seg; 916 | _mb_idx_inseg = mb_idx_inseg(wb, advance_cursor(wb)); 917 | 918 | /* 919 | * We should copy the cell data into the rambuf with lock held 920 | * otherwise subsequent write data may be written first and then overwritten by 921 | * the old data in the cell. 
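 *
 * Illustrative ordering if the copy were done outside the lock:
 *   1. this path decides the cell is still valid,
 *   2. a foreground write stages fresh data for the same sector,
 *   3. the stale cell data is then copied over the fresh data.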
922 | */ 923 | memcpy(wb->current_rambuf->data + ((_mb_idx_inseg + 1) << 12), cell->data, 1 << 12); 924 | 925 | mb = seg->mb_array + _mb_idx_inseg; 926 | ASSERT(!mb->dirtiness.is_dirty); 927 | mb->dirtiness.data_bits = 255; 928 | 929 | ht_register(wb, head, mb, &key); 930 | 931 | mutex_unlock(&wb->io_lock); 932 | 933 | dec_inflight_ios(wb, seg); 934 | } 935 | 936 | static void free_read_cache_cell_data(struct read_cache_cells *cells) 937 | { 938 | u32 i; 939 | for (i = 0; i < cells->size; i++) { 940 | struct read_cache_cell *cell = cells->array + i; 941 | vfree(cell->data); 942 | } 943 | } 944 | 945 | static struct read_cache_cells *alloc_read_cache_cells(struct wb_device *wb, u32 n) 946 | { 947 | struct read_cache_cells *cells; 948 | u32 i; 949 | cells = kmalloc(sizeof(struct read_cache_cells), GFP_KERNEL); 950 | if (!cells) 951 | return NULL; 952 | 953 | mutex_init(&cells->lock); 954 | cells->size = n; 955 | cells->threshold = UINT_MAX; /* Default: every read will be cached */ 956 | cells->last_sector = ~0; 957 | cells->seqcount = 0; 958 | cells->over_threshold = false; 959 | cells->array = kmalloc(sizeof(struct read_cache_cell) * n, GFP_KERNEL); 960 | if (!cells->array) 961 | goto bad_cells_array; 962 | 963 | for (i = 0; i < cells->size; i++) { 964 | struct read_cache_cell *cell = cells->array + i; 965 | cell->data = vmalloc(1 << 12); 966 | if (!cell->data) { 967 | u32 j; 968 | for (j = 0; j < i; j++) { 969 | cell = cells->array + j; 970 | vfree(cell->data); 971 | } 972 | goto bad_cell_data; 973 | } 974 | } 975 | 976 | cells->wq = create_singlethread_workqueue("dmwb_read_cache"); 977 | if (!cells->wq) 978 | goto bad_wq; 979 | 980 | return cells; 981 | 982 | bad_wq: 983 | free_read_cache_cell_data(cells); 984 | bad_cell_data: 985 | kfree(cells->array); 986 | bad_cells_array: 987 | kfree(cells); 988 | return NULL; 989 | } 990 | 991 | static void free_read_cache_cells(struct wb_device *wb) 992 | { 993 | struct read_cache_cells *cells = wb->read_cache_cells; 994 | destroy_workqueue(cells->wq); /* This drains wq. So, must precede the others */ 995 | free_read_cache_cell_data(cells); 996 | kfree(cells->array); 997 | kfree(cells); 998 | } 999 | 1000 | static void reinit_read_cache_cells(struct read_cache_cells *cells, u32 new_threshold) 1001 | { 1002 | u32 i; 1003 | 1004 | cells->rb_root = RB_ROOT; 1005 | cells->cursor = cells->size; 1006 | atomic_set(&cells->ack_count, cells->size); 1007 | for (i = 0; i < cells->size; i++) { 1008 | struct read_cache_cell *cell = cells->array + i; 1009 | atomic_set(&cell->cancelled, 0); 1010 | } 1011 | if (new_threshold && (new_threshold != cells->threshold)) { 1012 | cells->threshold = new_threshold; 1013 | cells->over_threshold = false; 1014 | } 1015 | } 1016 | 1017 | /* 1018 | * Cancel cells [first, last) 1019 | */ 1020 | static void visit_and_cancel_cells(struct rb_node *first, struct rb_node *last) 1021 | { 1022 | struct rb_node *rbp = first; 1023 | while (rbp != last) { 1024 | struct read_cache_cell *cell = read_cache_cell_from_node(rbp); 1025 | atomic_set(&cell->cancelled, 1); 1026 | rbp = rb_next(rbp); 1027 | } 1028 | } 1029 | 1030 | /* 1031 | * Find out sequence from cells and cancel them if larger than threshold. 
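 * Cells are visited in sector order via the rb-tree; a run is detected
 * whenever a cell starts exactly 8 sectors (4KB) after the previous one.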
1032 | */ 1033 | static void read_cache_cancel_background(struct read_cache_cells *cells) 1034 | { 1035 | struct rb_node *rbp = rb_first(&cells->rb_root); 1036 | struct rb_node *seqhead = rbp; 1037 | sector_t last_sector = ~0; 1038 | u32 seqcount = 0; 1039 | 1040 | while (rbp) { 1041 | struct read_cache_cell *cell = read_cache_cell_from_node(rbp); 1042 | if (cell->sector == (last_sector + 8)) 1043 | seqcount++; 1044 | else { 1045 | if (seqcount > cells->threshold) 1046 | visit_and_cancel_cells(seqhead, rbp); 1047 | seqcount = 1; 1048 | seqhead = rbp; 1049 | } 1050 | last_sector = cell->sector; 1051 | rbp = rb_next(rbp); 1052 | } 1053 | if (seqcount > cells->threshold) 1054 | visit_and_cancel_cells(seqhead, rbp); 1055 | } 1056 | 1057 | static void read_cache_proc(struct work_struct *work) 1058 | { 1059 | struct wb_device *wb = container_of(work, struct wb_device, read_cache_work); 1060 | struct read_cache_cells *cells = wb->read_cache_cells; 1061 | u32 i; 1062 | 1063 | read_cache_cancel_background(cells); 1064 | 1065 | for (i = 0; i < cells->size; i++) { 1066 | struct read_cache_cell *cell = cells->array + i; 1067 | inject_read_cache(wb, cell); 1068 | } 1069 | 1070 | mutex_lock(&cells->lock); 1071 | reinit_read_cache_cells(cells, read_once(wb->read_cache_threshold)); 1072 | mutex_unlock(&cells->lock); 1073 | } 1074 | 1075 | static int init_read_cache_cells(struct wb_device *wb) 1076 | { 1077 | struct read_cache_cells *cells; 1078 | INIT_WORK(&wb->read_cache_work, read_cache_proc); 1079 | cells = alloc_read_cache_cells(wb, wb->nr_read_cache_cells); 1080 | if (!cells) 1081 | return -ENOMEM; 1082 | wb->read_cache_cells = cells; 1083 | reinit_read_cache_cells(cells, wb->read_cache_threshold); 1084 | return 0; 1085 | } 1086 | 1087 | /*----------------------------------------------------------------------------*/ 1088 | 1089 | static void initialize_write_io(struct write_io *wio, struct bio *bio) 1090 | { 1091 | u8 offset = bio_calc_offset(bio); 1092 | sector_t count = bio_sectors(bio); 1093 | copy_bio_payload(wio->data + (offset << 9), bio); 1094 | wio->data_bits = to_mask(offset, count); 1095 | } 1096 | 1097 | static void memcpy_masked(void *to, u8 protect_bits, void *from, u8 copy_bits) 1098 | { 1099 | u8 i; 1100 | for (i = 0; i < 8; i++) { 1101 | bool will_copy = copy_bits & (1 << i); 1102 | bool protected = protect_bits & (1 << i); 1103 | if (will_copy && (!protected)) { 1104 | size_t offset = (i << 9); 1105 | memcpy(to + offset, from + offset, 1 << 9); 1106 | } 1107 | } 1108 | } 1109 | 1110 | int prepare_overwrite(struct wb_device *wb, struct segment_header *seg, struct metablock *old_mb, struct write_io* wio, u8 overwrite_bits) 1111 | { 1112 | struct dirtiness dirtiness = read_mb_dirtiness(wb, seg, old_mb); 1113 | 1114 | bool needs_merge_prev_cache = !(overwrite_bits == 255) || !(dirtiness.data_bits == 255); 1115 | 1116 | if (!dirtiness.is_dirty) 1117 | needs_merge_prev_cache = false; 1118 | 1119 | if (overwrite_bits == 255) 1120 | needs_merge_prev_cache = false; 1121 | 1122 | if (unlikely(needs_merge_prev_cache)) { 1123 | void *buf; 1124 | 1125 | wait_for_flushing(wb, seg->id); 1126 | ASSERT(dirtiness.is_dirty); 1127 | 1128 | buf = read_mb(wb, seg, old_mb, dirtiness.data_bits); 1129 | if (!buf) 1130 | return -EIO; 1131 | 1132 | /* newer data should be prioritized */ 1133 | memcpy_masked(wio->data, wio->data_bits, buf, dirtiness.data_bits); 1134 | wio->data_bits |= dirtiness.data_bits; 1135 | mempool_free(buf, wb->buf_8_pool); 1136 | } 1137 | 1138 | if (mark_clean_mb(wb, old_mb)) 1139 | 
dec_nr_dirty_caches(wb); 1140 | 1141 | ht_del(wb, old_mb); 1142 | 1143 | return 0; 1144 | } 1145 | 1146 | /* 1147 | * Get a new place to write. 1148 | */ 1149 | static struct metablock *prepare_new_write_pos(struct wb_device *wb) 1150 | { 1151 | struct metablock *ret = wb->current_seg->mb_array + mb_idx_inseg(wb, advance_cursor(wb)); 1152 | ASSERT(!ret->dirtiness.is_dirty); 1153 | ret->dirtiness.data_bits = 0; 1154 | return ret; 1155 | } 1156 | 1157 | static void write_on_rambuffer(struct wb_device *wb, struct metablock *write_pos, struct write_io *wio) 1158 | { 1159 | size_t mb_offset = (mb_idx_inseg(wb, write_pos->idx) + 1) << 12; 1160 | void *mb_data = wb->current_rambuf->data + mb_offset; 1161 | if (wio->data_bits == 255) 1162 | memcpy(mb_data, wio->data, 1 << 12); 1163 | else 1164 | memcpy_masked(mb_data, 0, wio->data, wio->data_bits); 1165 | } 1166 | 1167 | static int do_process_write(struct wb_device *wb, struct bio *bio) 1168 | { 1169 | int err = 0; 1170 | 1171 | struct metablock *write_pos = NULL; 1172 | struct lookup_result res; 1173 | 1174 | struct write_io wio; 1175 | wio.data = mempool_alloc(wb->buf_8_pool, GFP_NOIO); 1176 | if (!wio.data) 1177 | return -ENOMEM; 1178 | initialize_write_io(&wio, bio); 1179 | 1180 | mutex_lock(&wb->io_lock); 1181 | 1182 | cache_lookup(wb, bio, &res); 1183 | if (res.found) { 1184 | if (unlikely(res.on_buffer)) { 1185 | write_pos = res.found_mb; 1186 | goto do_write; 1187 | } else { 1188 | err = prepare_overwrite(wb, res.found_seg, res.found_mb, &wio, wio.data_bits); 1189 | dec_inflight_ios(wb, res.found_seg); 1190 | if (err) 1191 | goto out; 1192 | } 1193 | } 1194 | might_cancel_read_cache_cell(wb, bio); 1195 | 1196 | might_queue_current_buffer(wb); 1197 | 1198 | write_pos = prepare_new_write_pos(wb); 1199 | 1200 | do_write: 1201 | ASSERT(write_pos); 1202 | write_on_rambuffer(wb, write_pos, &wio); 1203 | 1204 | if (taint_mb(wb, write_pos, wio.data_bits)) 1205 | inc_nr_dirty_caches(wb); 1206 | 1207 | ht_register(wb, res.head, write_pos, &res.key); 1208 | 1209 | out: 1210 | mutex_unlock(&wb->io_lock); 1211 | mempool_free(wio.data, wb->buf_8_pool); 1212 | return err; 1213 | } 1214 | 1215 | static int complete_process_write(struct wb_device *wb, struct bio *bio) 1216 | { 1217 | dec_inflight_ios(wb, wb->current_seg); 1218 | 1219 | /* 1220 | * bio with FUA flag has data. 1221 | * We first handle it as a normal write bio and then as a barrier bio. 1222 | */ 1223 | if (bio_is_fua(bio)) { 1224 | queue_barrier_io(wb, bio); 1225 | return DM_MAPIO_SUBMITTED; 1226 | } 1227 | 1228 | bio_io_success_compat(bio); 1229 | return DM_MAPIO_SUBMITTED; 1230 | } 1231 | 1232 | /* 1233 | * (Locking) Dirtiness of a metablock 1234 | * ---------------------------------- 1235 | * A cache data is placed either on RAM buffer or SSD if it was flushed. 1236 | * To make locking easy, we simplify the rule for the dirtiness of a cache data. 1237 | * 1) If the data is on the RAM buffer, the dirtiness only "increases". 1238 | * 2) If the data is, on the other hand, on the SSD after flushed the dirtiness 1239 | * only "decreases". 1240 | * 1241 | * These simple rules can remove the possibility of dirtiness fluctuate on the 1242 | * RAM buffer. 
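 *
 * Concretely, taint_mb() (which only sets bits) is used for data on the
 * RAM buffer, while mark_clean_mb() (which only clears the dirty flag) is
 * used once the data has been flushed, e.g. in prepare_overwrite() and in
 * the writeback path.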
1243 | */ 1244 | 1245 | /* 1246 | * (Locking) Refcount (in_flight_*) 1247 | * -------------------------------- 1248 | * 1249 | * The basic common idea is 1250 | * 1) Increment the refcount inside lock 1251 | * 2) Wait for decrement outside the lock 1252 | * 1253 | * process_write: 1254 | * do_process_write: 1255 | * mutex_lock (to serialize write) 1256 | * inc in_flight_ios # refcount on the dst segment 1257 | * mutex_unlock 1258 | * 1259 | * complete_process_write: 1260 | * dec in_flight_ios 1261 | * bio_endio(bio) 1262 | */ 1263 | static int process_write_wb(struct wb_device *wb, struct bio *bio) 1264 | { 1265 | int err = do_process_write(wb, bio); 1266 | if (err) { 1267 | bio_io_error(bio); 1268 | return DM_MAPIO_SUBMITTED; 1269 | } 1270 | return complete_process_write(wb, bio); 1271 | } 1272 | 1273 | static int process_write_wa(struct wb_device *wb, struct bio *bio) 1274 | { 1275 | struct lookup_result res; 1276 | 1277 | mutex_lock(&wb->io_lock); 1278 | cache_lookup(wb, bio, &res); 1279 | if (res.found) { 1280 | dec_inflight_ios(wb, res.found_seg); 1281 | ht_del(wb, res.found_mb); 1282 | } 1283 | might_cancel_read_cache_cell(wb, bio); 1284 | mutex_unlock(&wb->io_lock); 1285 | 1286 | bio_remap(bio, wb->backing_dev, bi_sector(bio)); 1287 | return DM_MAPIO_REMAPPED; 1288 | } 1289 | 1290 | static int process_write(struct wb_device *wb, struct bio *bio) 1291 | { 1292 | return wb->write_around_mode ? process_write_wa(wb, bio) : process_write_wb(wb, bio); 1293 | } 1294 | 1295 | struct read_backing_async_context { 1296 | struct wb_device *wb; 1297 | struct bio *bio; 1298 | }; 1299 | 1300 | static void read_backing_async_callback_onstack(unsigned long error, struct read_backing_async_context *ctx) 1301 | { 1302 | ASSERT(bio_is_fullsize(ctx->bio)); 1303 | 1304 | read_cache_cell_copy_data(ctx->wb, ctx->bio, error); 1305 | 1306 | if (error) 1307 | bio_io_error(ctx->bio); 1308 | else 1309 | bio_io_success_compat(ctx->bio); 1310 | } 1311 | 1312 | static void read_backing_async_callback(unsigned long error, void *context) 1313 | { 1314 | struct read_backing_async_context *ctx = context; 1315 | read_backing_async_callback_onstack(error, ctx); 1316 | kfree(ctx); 1317 | } 1318 | 1319 | static int read_backing_async(struct wb_device *wb, struct bio *bio) 1320 | { 1321 | int err = 0; 1322 | 1323 | struct dm_io_request io_req; 1324 | struct dm_io_region region; 1325 | 1326 | struct read_backing_async_context *ctx = kmalloc(sizeof(struct read_backing_async_context), GFP_NOIO); 1327 | if (!ctx) 1328 | return -ENOMEM; 1329 | 1330 | ctx->wb = wb; 1331 | ctx->bio = bio; 1332 | 1333 | ASSERT(bio_is_fullsize(bio)); 1334 | 1335 | io_req = (struct dm_io_request) { 1336 | WB_IO_READ, 1337 | .client = wb->io_client, 1338 | #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,14,0) 1339 | .mem.type = DM_IO_BIO, 1340 | .mem.ptr.bio = bio, 1341 | #else 1342 | .mem.type = DM_IO_BVEC, 1343 | .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx, 1344 | #endif 1345 | .notify.fn = read_backing_async_callback, 1346 | .notify.context = ctx 1347 | }; 1348 | region = (struct dm_io_region) { 1349 | .bdev = wb->backing_dev->bdev, 1350 | .sector = bi_sector(bio), 1351 | .count = 8 1352 | }; 1353 | 1354 | err = wb_io(&io_req, 1, ®ion, NULL, false); 1355 | if (err) 1356 | kfree(ctx); 1357 | 1358 | return err; 1359 | } 1360 | 1361 | static int process_read(struct wb_device *wb, struct bio *bio) 1362 | { 1363 | struct lookup_result res; 1364 | struct dirtiness dirtiness; 1365 | struct per_bio_data *pbd; 1366 | 1367 | bool reserved = false; 1368 | 1369 | 
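/*
 * The lookup and the read-cache-cell reservation are done under the same
 * wb->io_lock that the write paths take, so they cannot interleave with a
 * concurrent write to the same 4KB block.
 */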
mutex_lock(&wb->io_lock); 1370 | cache_lookup(wb, bio, &res); 1371 | if (!res.found) 1372 | reserved = reserve_read_cache_cell(wb, bio); 1373 | mutex_unlock(&wb->io_lock); 1374 | 1375 | if (!res.found) { 1376 | if (reserved) { 1377 | /* 1378 | * Remapping clone bio to the backing store leads to 1379 | * empty payload in clone_endio(). 1380 | * To avoid caching junk data, we need this workaround 1381 | * to call dm_io() to certainly fill the bio payload. 1382 | */ 1383 | if (read_backing_async(wb, bio)) { 1384 | struct read_backing_async_context ctx = { 1385 | .wb = wb, 1386 | .bio = bio 1387 | }; 1388 | read_backing_async_callback_onstack(1, &ctx); 1389 | } 1390 | return DM_MAPIO_SUBMITTED; 1391 | } else { 1392 | bio_remap(bio, wb->backing_dev, bi_sector(bio)); 1393 | return DM_MAPIO_REMAPPED; 1394 | } 1395 | } 1396 | 1397 | dirtiness = read_mb_dirtiness(wb, res.found_seg, res.found_mb); 1398 | if (unlikely(res.on_buffer)) { 1399 | int err = fill_payload_by_backing(wb, bio); 1400 | if (err) 1401 | goto read_buffered_mb_exit; 1402 | 1403 | if (dirtiness.is_dirty) 1404 | copy_to_bio_payload(bio, ref_buffered_mb(wb, res.found_mb), dirtiness.data_bits); 1405 | 1406 | read_buffered_mb_exit: 1407 | dec_inflight_ios(wb, res.found_seg); 1408 | 1409 | if (unlikely(err)) 1410 | bio_io_error(bio); 1411 | else 1412 | bio_io_success_compat(bio); 1413 | 1414 | return DM_MAPIO_SUBMITTED; 1415 | } 1416 | 1417 | /* 1418 | * We need to wait for the segment to be flushed to the cache device. 1419 | * Without this, we might read the wrong data from the cache device. 1420 | */ 1421 | wait_for_flushing(wb, res.found_seg->id); 1422 | 1423 | if (unlikely(dirtiness.data_bits != 255)) { 1424 | int err = fill_payload_by_backing(wb, bio); 1425 | if (err) 1426 | goto read_mb_exit; 1427 | 1428 | if (dirtiness.is_dirty) { 1429 | void *buf = read_mb(wb, res.found_seg, res.found_mb, dirtiness.data_bits); 1430 | if (!buf) { 1431 | err = -EIO; 1432 | goto read_mb_exit; 1433 | } 1434 | copy_to_bio_payload(bio, buf, dirtiness.data_bits); 1435 | mempool_free(buf, wb->buf_8_pool); 1436 | } 1437 | 1438 | read_mb_exit: 1439 | dec_inflight_ios(wb, res.found_seg); 1440 | 1441 | if (unlikely(err)) 1442 | bio_io_error(bio); 1443 | else 1444 | bio_io_success_compat(bio); 1445 | 1446 | return DM_MAPIO_SUBMITTED; 1447 | } 1448 | 1449 | pbd = per_bio_data(bio); 1450 | pbd->type = PBD_READ_SEG; 1451 | pbd->seg = res.found_seg; 1452 | 1453 | bio_remap(bio, wb->cache_dev, 1454 | calc_mb_start_sector(wb, res.found_seg, res.found_mb->idx) + 1455 | bio_calc_offset(bio)); 1456 | 1457 | return DM_MAPIO_REMAPPED; 1458 | } 1459 | 1460 | static int process_bio(struct wb_device *wb, struct bio *bio) 1461 | { 1462 | return bio_is_write(bio) ? process_write(wb, bio) : process_read(wb, bio); 1463 | } 1464 | 1465 | static int process_barrier_bio(struct wb_device *wb, struct bio *bio) 1466 | { 1467 | /* barrier bio doesn't have data */ 1468 | ASSERT(bio_sectors(bio) == 0); 1469 | queue_barrier_io(wb, bio); 1470 | return DM_MAPIO_SUBMITTED; 1471 | } 1472 | 1473 | static int writeboost_map(struct dm_target *ti, struct bio *bio) 1474 | { 1475 | struct wb_device *wb = ti->private; 1476 | 1477 | struct per_bio_data *pbd = per_bio_data(bio); 1478 | pbd->type = PBD_NONE; 1479 | 1480 | if (bio_is_barrier(bio)) 1481 | return process_barrier_bio(wb, bio); 1482 | 1483 | return process_bio(wb, bio); 1484 | } 1485 | 1486 | /* 1487 | * DM_ENDIO_DONE was actually introduced since 4.12 but used restrictedly in rq-based dm. 
1488 | * In 4.13, a patch titled "dm: change ->end_io calling convention" changed the dm internal 1489 | * so other bio-based dm targets should follow the convension. 1490 | * For this reason, I will start to use the DM_ENDIO_DONE at 4.13. 1491 | */ 1492 | #if LINUX_VERSION_CODE >= KERNEL_VERSION(4,13,0) 1493 | #define DM_ENDIO_DONE_COMPAT DM_ENDIO_DONE 1494 | static int writeboost_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *error) 1495 | #else 1496 | #define DM_ENDIO_DONE_COMPAT 0 1497 | static int writeboost_end_io(struct dm_target *ti, struct bio *bio, int error) 1498 | #endif 1499 | { 1500 | struct wb_device *wb = ti->private; 1501 | struct per_bio_data *pbd = per_bio_data(bio); 1502 | 1503 | switch (pbd->type) { 1504 | case PBD_NONE: 1505 | case PBD_WILL_CACHE: 1506 | return DM_ENDIO_DONE_COMPAT; 1507 | case PBD_READ_SEG: 1508 | dec_inflight_ios(wb, pbd->seg); 1509 | return DM_ENDIO_DONE_COMPAT; 1510 | default: 1511 | BUG(); 1512 | } 1513 | } 1514 | 1515 | static int consume_essential_argv(struct wb_device *wb, struct dm_arg_set *as) 1516 | { 1517 | int err = 0; 1518 | struct dm_target *ti = wb->ti; 1519 | 1520 | err = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table), 1521 | &wb->backing_dev); 1522 | if (err) { 1523 | DMERR("Failed to get backing_dev"); 1524 | return err; 1525 | } 1526 | 1527 | err = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table), 1528 | &wb->cache_dev); 1529 | if (err) { 1530 | DMERR("Failed to get cache_dev"); 1531 | goto bad_get_cache; 1532 | } 1533 | 1534 | return err; 1535 | 1536 | bad_get_cache: 1537 | dm_put_device(ti, wb->backing_dev); 1538 | return err; 1539 | } 1540 | 1541 | #define consume_kv(name, nr, is_static) { \ 1542 | if (!strcasecmp(key, #name)) { \ 1543 | if (!argc) \ 1544 | break; \ 1545 | if (test_bit(WB_CREATED, &wb->flags) && is_static) { \ 1546 | DMERR("%s is a static option", #name); \ 1547 | break; \ 1548 | } \ 1549 | err = dm_read_arg(_args + (nr), as, &tmp, &ti->error); \ 1550 | if (err) { \ 1551 | DMERR("%s", ti->error); \ 1552 | break; \ 1553 | } \ 1554 | wb->name = tmp; \ 1555 | } } 1556 | 1557 | static int do_consume_optional_argv(struct wb_device *wb, struct dm_arg_set *as, unsigned argc) 1558 | { 1559 | int err = 0; 1560 | struct dm_target *ti = wb->ti; 1561 | 1562 | static struct dm_arg _args[] = { 1563 | {0, 100, "Invalid writeback_threshold"}, 1564 | {1, 32, "Invalid nr_max_batched_writeback"}, 1565 | {0, 3600, "Invalid update_sb_record_interval"}, 1566 | {0, 3600, "Invalid sync_data_interval"}, 1567 | {0, 127, "Invalid read_cache_threshold"}, 1568 | {0, 1, "Invalid write_around_mode"}, 1569 | {1, 2048, "Invalid nr_read_cache_cells"}, 1570 | }; 1571 | unsigned tmp; 1572 | 1573 | while (argc) { 1574 | const char *key = dm_shift_arg(as); 1575 | argc--; 1576 | 1577 | err = -EINVAL; 1578 | 1579 | consume_kv(writeback_threshold, 0, false); 1580 | consume_kv(nr_max_batched_writeback, 1, false); 1581 | consume_kv(update_sb_record_interval, 2, false); 1582 | consume_kv(sync_data_interval, 3, false); 1583 | consume_kv(read_cache_threshold, 4, false); 1584 | consume_kv(write_around_mode, 5, true); 1585 | consume_kv(nr_read_cache_cells, 6, true); 1586 | 1587 | if (!err) { 1588 | argc--; 1589 | } else { 1590 | ti->error = "Invalid optional key"; 1591 | break; 1592 | } 1593 | } 1594 | 1595 | return err; 1596 | } 1597 | 1598 | static int consume_optional_argv(struct wb_device *wb, struct dm_arg_set *as) 1599 | { 1600 | int err = 0; 1601 | struct dm_target *ti = wb->ti; 1602 | 1603 | static struct 
dm_arg _args[] = { 1604 | {0, 14, "Invalid optional argc"}, 1605 | }; 1606 | unsigned argc = 0; 1607 | 1608 | if (as->argc) { 1609 | err = dm_read_arg_group(_args, as, &argc, &ti->error); 1610 | if (err) { 1611 | DMERR("%s", ti->error); 1612 | return err; 1613 | } 1614 | } 1615 | 1616 | return do_consume_optional_argv(wb, as, argc); 1617 | } 1618 | 1619 | DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(wb_copy_throttle, 1620 | "A percentage of time allocated for one-shot writeback"); 1621 | 1622 | static int init_core_struct(struct dm_target *ti) 1623 | { 1624 | int err = 0; 1625 | struct wb_device *wb; 1626 | 1627 | err = dm_set_target_max_io_len(ti, 1 << 3); 1628 | if (err) { 1629 | DMERR("Failed to set max_io_len"); 1630 | return err; 1631 | } 1632 | 1633 | ti->num_flush_bios = 1; 1634 | ti->flush_supported = true; 1635 | 1636 | /* 1637 | * dm-writeboost does't support TRIM 1638 | * 1639 | * https://github.com/akiradeveloper/dm-writeboost/issues/110 1640 | * - discarding backing data only violates DRAT 1641 | * - strictly discarding both cache blocks and backing data is nearly impossible 1642 | * considering cache hits may occur partially. 1643 | */ 1644 | ti->num_discard_bios = 0; 1645 | ti->discards_supported = false; 1646 | 1647 | ti->PER_BIO_DATA_SIZE = sizeof(struct per_bio_data); 1648 | 1649 | wb = kzalloc(sizeof(*wb), GFP_KERNEL); 1650 | if (!wb) { 1651 | DMERR("Failed to allocate wb"); 1652 | return -ENOMEM; 1653 | } 1654 | ti->private = wb; 1655 | wb->ti = ti; 1656 | 1657 | wb->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle); 1658 | if (IS_ERR(wb->copier)) { 1659 | err = PTR_ERR(wb->copier); 1660 | goto bad_kcopyd_client; 1661 | } 1662 | 1663 | wb->buf_8_cachep = kmem_cache_create("dmwb_buf_8", 1664 | 1 << 12, 1 << 12, SLAB_RED_ZONE, NULL); 1665 | if (!wb->buf_8_cachep) { 1666 | err = -ENOMEM; 1667 | goto bad_buf_8_cachep; 1668 | } 1669 | wb->buf_8_pool = mempool_create_slab_pool(16, wb->buf_8_cachep); 1670 | if (!wb->buf_8_pool) { 1671 | err = -ENOMEM; 1672 | goto bad_buf_8_pool; 1673 | } 1674 | 1675 | wb->io_wq = create_singlethread_workqueue("dmwb_io"); 1676 | if (!wb->io_wq) { 1677 | DMERR("Failed to allocate io_wq"); 1678 | err = -ENOMEM; 1679 | goto bad_io_wq; 1680 | } 1681 | 1682 | wb->io_client = dm_io_client_create(); 1683 | if (IS_ERR(wb->io_client)) { 1684 | DMERR("Failed to allocate io_client"); 1685 | err = PTR_ERR(wb->io_client); 1686 | goto bad_io_client; 1687 | } 1688 | 1689 | mutex_init(&wb->io_lock); 1690 | init_waitqueue_head(&wb->inflight_ios_wq); 1691 | spin_lock_init(&wb->mb_lock); 1692 | atomic64_set(&wb->nr_dirty_caches, 0); 1693 | clear_bit(WB_CREATED, &wb->flags); 1694 | 1695 | return err; 1696 | 1697 | bad_io_client: 1698 | destroy_workqueue(wb->io_wq); 1699 | bad_io_wq: 1700 | mempool_destroy(wb->buf_8_pool); 1701 | bad_buf_8_pool: 1702 | kmem_cache_destroy(wb->buf_8_cachep); 1703 | bad_buf_8_cachep: 1704 | dm_kcopyd_client_destroy(wb->copier); 1705 | bad_kcopyd_client: 1706 | kfree(wb); 1707 | return err; 1708 | } 1709 | 1710 | static void free_core_struct(struct wb_device *wb) 1711 | { 1712 | dm_io_client_destroy(wb->io_client); 1713 | destroy_workqueue(wb->io_wq); 1714 | mempool_destroy(wb->buf_8_pool); 1715 | kmem_cache_destroy(wb->buf_8_cachep); 1716 | dm_kcopyd_client_destroy(wb->copier); 1717 | kfree(wb); 1718 | } 1719 | 1720 | static int copy_ctr_args(struct wb_device *wb, int argc, const char **argv) 1721 | { 1722 | unsigned i; 1723 | const char **copy; 1724 | 1725 | copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL); 1726 | if (!copy) 1727 | 
return -ENOMEM; 1728 | for (i = 0; i < argc; i++) { 1729 | copy[i] = kstrdup(argv[i], GFP_KERNEL); 1730 | if (!copy[i]) { 1731 | while (i--) 1732 | kfree(copy[i]); 1733 | kfree(copy); 1734 | return -ENOMEM; 1735 | } 1736 | } 1737 | 1738 | wb->nr_ctr_args = argc; 1739 | wb->ctr_args = copy; 1740 | 1741 | return 0; 1742 | } 1743 | 1744 | static void free_ctr_args(struct wb_device *wb) 1745 | { 1746 | int i; 1747 | for (i = 0; i < wb->nr_ctr_args; i++) 1748 | kfree(wb->ctr_args[i]); 1749 | kfree(wb->ctr_args); 1750 | } 1751 | 1752 | #define save_arg(name) wb->name##_saved = wb->name 1753 | #define restore_arg(name) if (wb->name##_saved) { wb->name = wb->name##_saved; } 1754 | 1755 | /* 1756 | * Create a writeboost device 1757 | * 1758 | * 1759 | * <#optional args> 1760 | * optionals are unordered lists of k-v pair. 1761 | * 1762 | * See doc for detail. 1763 | */ 1764 | static int writeboost_ctr(struct dm_target *ti, unsigned int argc, char **argv) 1765 | { 1766 | int err = 0; 1767 | struct wb_device *wb; 1768 | 1769 | struct dm_arg_set as; 1770 | as.argc = argc; 1771 | as.argv = argv; 1772 | 1773 | err = init_core_struct(ti); 1774 | if (err) { 1775 | ti->error = "init_core_struct failed"; 1776 | return err; 1777 | } 1778 | wb = ti->private; 1779 | 1780 | err = copy_ctr_args(wb, argc - 2, (const char **)argv + 2); 1781 | if (err) { 1782 | ti->error = "copy_ctr_args failed"; 1783 | goto bad_ctr_args; 1784 | } 1785 | 1786 | err = consume_essential_argv(wb, &as); 1787 | if (err) { 1788 | ti->error = "consume_essential_argv failed"; 1789 | goto bad_essential_argv; 1790 | } 1791 | 1792 | err = consume_optional_argv(wb, &as); 1793 | if (err) { 1794 | ti->error = "consume_optional_argv failed"; 1795 | goto bad_optional_argv; 1796 | } 1797 | 1798 | save_arg(writeback_threshold); 1799 | save_arg(nr_max_batched_writeback); 1800 | save_arg(update_sb_record_interval); 1801 | save_arg(sync_data_interval); 1802 | save_arg(read_cache_threshold); 1803 | save_arg(nr_read_cache_cells); 1804 | 1805 | err = resume_cache(wb); 1806 | if (err) { 1807 | ti->error = "resume_cache failed"; 1808 | goto bad_resume_cache; 1809 | } 1810 | 1811 | wb->nr_read_cache_cells = 2048; /* 8MB */ 1812 | restore_arg(nr_read_cache_cells); 1813 | err = init_read_cache_cells(wb); 1814 | if (err) { 1815 | ti->error = "init_read_cache_cells failed"; 1816 | goto bad_read_cache_cells; 1817 | } 1818 | 1819 | clear_stat(wb); 1820 | 1821 | set_bit(WB_CREATED, &wb->flags); 1822 | 1823 | restore_arg(writeback_threshold); 1824 | restore_arg(nr_max_batched_writeback); 1825 | restore_arg(update_sb_record_interval); 1826 | restore_arg(sync_data_interval); 1827 | restore_arg(read_cache_threshold); 1828 | 1829 | return err; 1830 | 1831 | bad_read_cache_cells: 1832 | free_cache(wb); 1833 | bad_resume_cache: 1834 | dm_put_device(ti, wb->cache_dev); 1835 | dm_put_device(ti, wb->backing_dev); 1836 | bad_optional_argv: 1837 | bad_essential_argv: 1838 | free_ctr_args(wb); 1839 | bad_ctr_args: 1840 | free_core_struct(wb); 1841 | ti->private = NULL; 1842 | 1843 | return err; 1844 | } 1845 | 1846 | static void writeboost_dtr(struct dm_target *ti) 1847 | { 1848 | struct wb_device *wb = ti->private; 1849 | 1850 | free_read_cache_cells(wb); 1851 | 1852 | free_cache(wb); 1853 | 1854 | dm_put_device(ti, wb->cache_dev); 1855 | dm_put_device(ti, wb->backing_dev); 1856 | 1857 | free_ctr_args(wb); 1858 | 1859 | free_core_struct(wb); 1860 | ti->private = NULL; 1861 | } 1862 | 1863 | /*----------------------------------------------------------------------------*/ 1864 | 
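/*
 * Illustrative use of the runtime message interface handled by
 * writeboost_message() below (the device name "wbdev" and the value 70
 * are assumptions, not defaults):
 *
 *   dmsetup message wbdev 0 clear_stat
 *   dmsetup message wbdev 0 drop_caches
 *   dmsetup message wbdev 0 writeback_threshold 70
 */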

static void writeboost_dtr(struct dm_target *ti)
{
	struct wb_device *wb = ti->private;

	free_read_cache_cells(wb);

	free_cache(wb);

	dm_put_device(ti, wb->cache_dev);
	dm_put_device(ti, wb->backing_dev);

	free_ctr_args(wb);

	free_core_struct(wb);
	ti->private = NULL;
}

/*----------------------------------------------------------------------------*/

/*
 * .postsuspend is called before .dtr.
 * We flush out all the transient data and make them persistent.
 */
static void writeboost_postsuspend(struct dm_target *ti)
{
	struct wb_device *wb = ti->private;
	flush_current_buffer(wb);
	dm_blkdev_issue_flush(wb->cache_dev->bdev, GFP_NOIO);
}

#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,17,0)
static int writeboost_message(struct dm_target *ti, unsigned argc, char **argv,
			      char *result, unsigned maxlen)
#else
static int writeboost_message(struct dm_target *ti, unsigned argc, char **argv)
#endif
{
	struct wb_device *wb = ti->private;

	struct dm_arg_set as;
	as.argc = argc;
	as.argv = argv;

	if (!strcasecmp(argv[0], "clear_stat")) {
		clear_stat(wb);
		return 0;
	}

	if (!strcasecmp(argv[0], "drop_caches")) {
		int err = 0;
		wb->force_drop = true;
		err = wait_event_interruptible(wb->wait_drop_caches,
				!atomic64_read(&wb->nr_dirty_caches));
		wb->force_drop = false;
		return err;
	}

	return do_consume_optional_argv(wb, &as, 2);
}
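/*
 * Illustrative sketch (not part of the original source): messages accepted by
 * writeboost_message() above. Besides "clear_stat" and "drop_caches", a
 * single key-value pair is re-parsed through do_consume_optional_argv() with
 * two tokens. The device name "wbdev" is a hypothetical example:
 *
 *   dmsetup message wbdev 0 clear_stat
 *   dmsetup message wbdev 0 drop_caches
 *   dmsetup message wbdev 0 writeback_threshold 70
 */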

static int writeboost_iterate_devices(struct dm_target *ti,
				      iterate_devices_callout_fn fn, void *data)
{
	int r = 0;
	struct wb_device *wb = ti->private;

	r = fn(ti, wb->cache_dev, 0, dm_devsize(wb->cache_dev), data);
	if (!r)
		r = fn(ti, wb->backing_dev, 0, ti->len, data);

	return r;
}

static void writeboost_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
#if LINUX_VERSION_CODE >= KERNEL_VERSION(6,12,0)
	limits->io_opt = 4096;
#else
	blk_limits_io_opt(limits, 4096);
#endif
}

/*
 * STATUSTYPE_INFO emits: cursor, nr_caches, nr_segments, the current segment
 * id, the last flushed and last written-back segment ids, nr_dirty_caches,
 * the STATLEN statistics counters, count_non_full_flushed, and finally the
 * tunable key-value pairs preceded by their token count.
 */
static void writeboost_status(struct dm_target *ti, status_type_t type,
			      unsigned flags, char *result, unsigned maxlen)
{
	ssize_t sz = 0;
	char buf[BDEVNAME_SIZE];
	struct wb_device *wb = ti->private;
	size_t i;

	switch (type) {
	case STATUSTYPE_INFO:
		DMEMIT("%u %u %llu %llu %llu %llu %llu",
		       (unsigned int) wb->cursor,
		       (unsigned int) wb->nr_caches,
		       (long long unsigned int) wb->nr_segments,
		       (long long unsigned int) wb->current_seg->id,
		       (long long unsigned int) atomic64_read(&wb->last_flushed_segment_id),
		       (long long unsigned int) atomic64_read(&wb->last_writeback_segment_id),
		       (long long unsigned int) atomic64_read(&wb->nr_dirty_caches));

		for (i = 0; i < STATLEN; i++) {
			atomic64_t *v = &wb->stat[i];
			DMEMIT(" %llu", (unsigned long long) atomic64_read(v));
		}
		DMEMIT(" %llu", (unsigned long long) atomic64_read(&wb->count_non_full_flushed));

		DMEMIT(" %d", 10); /* number of tunable tokens that follow (5 key-value pairs) */
		DMEMIT(" writeback_threshold %d", wb->writeback_threshold);
		DMEMIT(" nr_cur_batched_writeback %u", wb->nr_cur_batched_writeback);
		DMEMIT(" sync_data_interval %lu", wb->sync_data_interval);
		DMEMIT(" update_sb_record_interval %lu", wb->update_sb_record_interval);
		DMEMIT(" read_cache_threshold %u", wb->read_cache_threshold);
		break;

	case STATUSTYPE_TABLE:
		format_dev_t(buf, wb->backing_dev->bdev->bd_dev);
		DMEMIT("%s", buf);
		format_dev_t(buf, wb->cache_dev->bdev->bd_dev);
		DMEMIT(" %s", buf);

		for (i = 0; i < wb->nr_ctr_args; i++)
			DMEMIT(" %s", wb->ctr_args[i]);
		break;

#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,15,0)
	case STATUSTYPE_IMA:
		*result = '\0';
		break;
#endif
	}
}

static struct target_type writeboost_target = {
	.name = "writeboost",
	.version = {2, 2, 19},
	.module = THIS_MODULE,
	.map = writeboost_map,
	.end_io = writeboost_end_io,
	.ctr = writeboost_ctr,
	.dtr = writeboost_dtr,
	.postsuspend = writeboost_postsuspend,
	.message = writeboost_message,
	.status = writeboost_status,
	.io_hints = writeboost_io_hints,
	.iterate_devices = writeboost_iterate_devices,
};

static int __init writeboost_module_init(void)
{
	int err = 0;

	err = dm_register_target(&writeboost_target);
	if (err < 0) {
		DMERR("Failed to register target");
		return err;
	}

	return err;
}

static void __exit writeboost_module_exit(void)
{
	dm_unregister_target(&writeboost_target);
}

module_init(writeboost_module_init);
module_exit(writeboost_module_exit);

MODULE_AUTHOR("Akira Hayakawa ");
MODULE_DESCRIPTION(DM_NAME " writeboost target");
MODULE_LICENSE("GPL");
--------------------------------------------------------------------------------