├── src
│   ├── .gitignore
│   ├── dkms.conf
│   ├── Makefile
│   ├── dm-writeboost-daemon.h
│   ├── dm-writeboost-metadata.h
│   ├── dm-writeboost.h
│   ├── dm-writeboost-daemon.c
│   ├── dm-writeboost-metadata.c
│   └── dm-writeboost-target.c
├── Makefile
├── .github
│   └── workflows
│       └── ci.yml
├── README.md
├── ChangeLog
├── doc
│   └── dm-writeboost-readme.txt
└── LICENSE

/src/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore
3 | !dm-writeboost.h
4 | !dm-writeboost-target.[c]
5 | !dm-writeboost-metadata.[ch]
6 | !dm-writeboost-daemon.[ch]
7 | !dkms.conf
8 | !Makefile
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | MODULE_VERSION ?= 2.2.19
2 | DKMS_DIR := /usr/src/dm-writeboost-$(MODULE_VERSION)
3 | DKMS_KEY := -m dm-writeboost -v $(MODULE_VERSION)
4 | 
5 | install:
6 | 	cp -r src $(DKMS_DIR)
7 | 	dkms add $(DKMS_KEY)
8 | 	dkms build $(DKMS_KEY)
9 | 	dkms install $(DKMS_KEY)
10 | 
11 | uninstall:
12 | 	dkms remove --all $(DKMS_KEY)
13 | 	rm -rf $(DKMS_DIR)
14 | 
--------------------------------------------------------------------------------
/src/dkms.conf:
--------------------------------------------------------------------------------
1 | PACKAGE_NAME="dm-writeboost"
2 | PACKAGE_VERSION="2.2.19"
3 | 
4 | # dm-writeboost builds on top of dm features introduced in Linux 3.9
5 | BUILD_EXCLUSIVE_KERNEL_MIN="3.9"
6 | 
7 | BUILT_MODULE_NAME="dm-writeboost"
8 | DEST_MODULE_LOCATION="/kernel/drivers/md"
9 | MAKE="make all KERNEL_TREE=$kernel_source_dir"
10 | CLEAN="make clean"
11 | AUTOINSTALL="yes"
12 | 
--------------------------------------------------------------------------------
/src/Makefile:
--------------------------------------------------------------------------------
1 | KERNEL_SOURCE_VERSION ?= $(shell uname -r)
2 | KERNEL_TREE ?= /lib/modules/$(KERNEL_SOURCE_VERSION)/build
3 | 
4 | obj-m := dm-writeboost.o
5 | dm-writeboost-objs := \
6 | 	dm-writeboost-target.o \
7 | 	dm-writeboost-metadata.o \
8 | 	dm-writeboost-daemon.o
9 | 
10 | all:
11 | 	$(MAKE) -C $(KERNEL_TREE) M=$(PWD) modules
12 | 
13 | clean:
14 | 	$(MAKE) -C $(KERNEL_TREE) M=$(PWD) clean
15 | 
--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | name: CI
2 | 
3 | on:
4 |   pull_request:
5 |     branches:
6 |       - master
7 |   push:
8 |     branches:
9 |       - master
10 | 
11 | jobs:
12 |   run_tests:
13 |     name: Tests
14 |     runs-on: ubuntu-24.04
15 |     steps:
16 |       - name: Kernel version
17 |         run: uname -a
18 | 
19 |       - name: Install dependencies
20 |         uses: awalsh128/cache-apt-pkgs-action@latest
21 |         with:
22 |           packages: cargo dkms cryptsetup xfsprogs dbench stress
23 |           version: 1.0
24 | 
25 |       - name: Check cryptsetup enabled
26 |         run: cryptsetup benchmark -c aes-xts-plain64 -s 512
27 | 
28 |       - name: Checkout device-mapper-tests
29 |         uses: actions/checkout@v4
30 |         with:
31 |           repository: akiradeveloper/device-mapper-tests
32 |           ref: master
33 |           path: tests
34 | 
35 |       - name: Checkout dm-writeboost
36 |         uses: actions/checkout@v4
37 |         with:
38 |           repository: akiradeveloper/dm-writeboost
39 |           path: module
40 | 
41 |       - name: Install dm-writeboost target
42 |         working-directory: module
43 |         run: |
44 |           sudo mkdir -p /var/lib/dkms
45 |           sudo make install
46 | 
47 |       - name: Load dm-writeboost
48 |         run: sudo modprobe dm-writeboost
49 | 
50 |       - name: Checkout dm-writeboost-tools
51 |         uses:
actions/checkout@v4 52 | with: 53 | repository: akiradeveloper/dm-writeboost-tools 54 | ref: master 55 | path: tools 56 | 57 | - name: Install dm-writeboost-tools 58 | working-directory: tools 59 | run: sudo cargo install --path . --root /usr/local 60 | 61 | - name: Test (wb-command) 62 | working-directory: tests/wb-command-tests 63 | run: sudo make test 64 | 65 | - name: Test (writeboost) 66 | working-directory: tests/writeboost-tests 67 | run: sudo make test -------------------------------------------------------------------------------- /src/dm-writeboost-daemon.h: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of dm-writeboost 3 | * Copyright (C) 2012-2025 Akira Hayakawa 4 | * 5 | * This program is free software; you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation; either version 2 of the License, or 8 | * (at your option) any later version. 9 | * 10 | * This program is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | * GNU General Public License for more details. 14 | * 15 | * You should have received a copy of the GNU General Public License along 16 | * with this program; if not, write to the Free Software Foundation, Inc., 17 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 18 | */ 19 | 20 | #ifndef DM_WRITEBOOST_DAEMON_H 21 | #define DM_WRITEBOOST_DAEMON_H 22 | 23 | /*----------------------------------------------------------------------------*/ 24 | 25 | int flush_daemon_proc(void *); 26 | void wait_for_flushing(struct wb_device *, u64 id); 27 | 28 | /*----------------------------------------------------------------------------*/ 29 | 30 | void queue_barrier_io(struct wb_device *, struct bio *); 31 | void flush_barrier_ios(struct work_struct *); 32 | 33 | /*----------------------------------------------------------------------------*/ 34 | 35 | void update_nr_empty_segs(struct wb_device *); 36 | int writeback_daemon_proc(void *); 37 | void wait_for_writeback(struct wb_device *, u64 id); 38 | void mark_clean_seg(struct wb_device *, struct segment_header *seg); 39 | 40 | /*----------------------------------------------------------------------------*/ 41 | 42 | int writeback_modulator_proc(void *); 43 | 44 | /*----------------------------------------------------------------------------*/ 45 | 46 | int data_synchronizer_proc(void *); 47 | 48 | /*----------------------------------------------------------------------------*/ 49 | 50 | int sb_record_updater_proc(void *); 51 | 52 | /*----------------------------------------------------------------------------*/ 53 | 54 | #endif 55 | -------------------------------------------------------------------------------- /src/dm-writeboost-metadata.h: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of dm-writeboost 3 | * Copyright (C) 2012-2025 Akira Hayakawa 4 | * 5 | * This program is free software; you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation; either version 2 of the License, or 8 | * (at your option) any later version. 
9 | *
10 | * This program is distributed in the hope that it will be useful,
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | * GNU General Public License for more details.
14 | *
15 | * You should have received a copy of the GNU General Public License along
16 | * with this program; if not, write to the Free Software Foundation, Inc.,
17 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
18 | */
19 | 
20 | #ifndef DM_WRITEBOOST_METADATA_H
21 | #define DM_WRITEBOOST_METADATA_H
22 | 
23 | /*----------------------------------------------------------------------------*/
24 | 
25 | struct segment_header *
26 | get_segment_header_by_id(struct wb_device *, u64 segment_id);
27 | struct rambuffer *get_rambuffer_by_id(struct wb_device *wb, u64 id);
28 | sector_t calc_mb_start_sector(struct wb_device *, struct segment_header *,
29 | 			      u32 mb_idx);
30 | u8 mb_idx_inseg(struct wb_device *, u32 mb_idx);
31 | struct segment_header *mb_to_seg(struct wb_device *, struct metablock *);
32 | bool is_on_buffer(struct wb_device *, u32 mb_idx);
33 | 
34 | /*----------------------------------------------------------------------------*/
35 | 
36 | struct lookup_key {
37 | 	sector_t sector;
38 | };
39 | 
40 | struct ht_head;
41 | struct ht_head *ht_get_head(struct wb_device *, struct lookup_key *);
42 | struct metablock *ht_lookup(struct wb_device *,
43 | 			    struct ht_head *, struct lookup_key *);
44 | void ht_register(struct wb_device *, struct ht_head *,
45 | 		 struct metablock *, struct lookup_key *);
46 | void ht_del(struct wb_device *, struct metablock *);
47 | void discard_caches_inseg(struct wb_device *, struct segment_header *);
48 | 
49 | /*----------------------------------------------------------------------------*/
50 | 
51 | void prepare_segment_header_device(void *rambuffer, struct wb_device *,
52 | 				   struct segment_header *src);
53 | u32 calc_checksum(void *rambuffer, u8 length);
54 | 
55 | /*----------------------------------------------------------------------------*/
56 | 
57 | int try_alloc_writeback_ios(struct wb_device *, size_t nr_batch, gfp_t gfp);
58 | 
59 | /*----------------------------------------------------------------------------*/
60 | 
61 | int resume_cache(struct wb_device *);
62 | void free_cache(struct wb_device *);
63 | 
64 | /*----------------------------------------------------------------------------*/
65 | 
66 | #endif
67 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # dm-writeboost
2 | 
3 | ![CI](https://github.com/akiradeveloper/dm-writeboost/actions/workflows/ci.yml/badge.svg)
4 | 
5 | Log-structured Caching for Linux
6 | 
7 | ## Overview
8 | dm-writeboost originated from [Disk Caching Disk (DCD)](http://www.ele.uri.edu/research/hpcl/DCD/DCD.html).
9 | DCD, implemented in Solaris, is an OS-level IO controller that builds logs from incoming writes
10 | (data and metadata) and then writes the logs sequentially, similar to a log-structured filesystem.
11 | dm-writeboost implements this concept on Linux's device-mapper in a more sophisticated way.
12 | As a further extension, dm-writeboost supports read-caching, which also writes data sequentially.
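For a quick taste, a dm-writeboost'd device is built roughly like this. This is an illustrative sketch only: the device paths are placeholders, and the full table format and tunables are described in doc/dm-writeboost-readme.txt.

```sh
# Placeholders: /dev/sdb is the backing HDD, /dev/sdc is the caching SSD.
BACKING=/dev/sdb
CACHE=/dev/sdc

# Zero the first sector of the caching device ONLY on the very first setup;
# this triggers reformatting. Skip this step when resuming an existing cache.
dd if=/dev/zero of=$CACHE oflag=direct bs=512 count=1

# Create the dm-writeboost'd device (here with one optional key-value pair).
sz=$(blockdev --getsz $BACKING)
dmsetup create wbdev --table "0 $sz writeboost $BACKING $CACHE 2 writeback_threshold 70"
```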
13 | 
14 | ## Documents
15 | - [dm-writeboost-quickstart](https://docs.google.com/presentation/d/1v-L8Ma138o7jNBFqRl0epyc1Lji3XhUH1RGj8p7DVe8/edit?usp=sharing)
16 | - doc/dm-writeboost-readme.txt
17 | - [dm-writeboost-internal](https://docs.google.com/presentation/d/1mDh5ct3OR-eRxBbci3LQgaTvUFx9WTLw-kkBxNBeTD8/edit?usp=sharing)
18 | - [Wiki](https://github.com/akiradeveloper/dm-writeboost/wiki)
19 | 
20 | ## Features
21 | * **Durable**: A power failure can't break consistency because each log consists of data, metadata and
22 | the checksum of the log itself.
23 | * **Lifetime**: Other caching software (e.g. dm-cache) separates data and
24 | metadata and therefore submits writes to the SSD too frequently. dm-writeboost,
25 | on the other hand, submits only one write for hundreds of data and
26 | metadata updates, so the SSD lives longer since an SSD's lifetime depends on
27 | how many writes are submitted to it.
28 | * **Fast**: Since sequential writes are the best I/O pattern for every SSD and the code base is optimized for
29 | incoming random writes, the write performance is the best among all caching drivers, including dm-cache and
30 | bcache.
31 | * **Portable**: All kernel versions 3.10 and later are supported with a minimum of compile-time macros.
32 | 
33 | ## Usage
34 | - **Install**: `sudo make install` to install and `sudo make uninstall` to uninstall.
35 | `sudo make uninstall MODULE_VERSION=xxx` uninstalls a specific installed version.
36 | DKMS is required, so please install it beforehand (it is usually available in your distribution's package system).
37 | - **Make a device**: Write a script that builds the caching device. Please read doc/dm-writeboost-readme.txt for
38 | the details of the dmsetup command.
39 | After a reboot, you need to rebuild the caching device rather than reformat it as in the initial setup.
40 | 
41 | ## Distribution Packages
42 | - [Debian](https://packages.debian.org/search?keywords=dm-writeboost-dkms)
43 | - [Ubuntu](https://packages.ubuntu.com/search?keywords=dm-writeboost-dkms)
44 | 
45 | ## Related Projects
46 | * https://github.com/akiradeveloper/dm-writeboost-tools: Tools to help users analyze the state of the caching device
47 | * https://gitlab.com/onlyjob/writeboost: A management tool including an init script
48 | * https://github.com/akiradeveloper/device-mapper-tests: Testing framework written in Rust
49 | 
50 | ## Related works
51 | * Y. Hu and Q. Yang -- DCD Disk Caching Disk: A New Approach for Boosting I/O Performance (1995)
52 | (http://www.ele.uri.edu/research/hpcl/DCD/DCD.html)
53 | * G. Soundararajan et al. -- Extending SSD Lifetimes with Disk-Based Write Caches (2010)
54 | (https://www.usenix.org/conference/fast-10/extending-ssd-lifetimes-disk-based-write-caches)
55 | * Y. Oh -- SSD RAID as Cache (SRC) with Log-structured Approach for Performance and Reliability (2014)
56 | (https://ysoh.files.wordpress.com/2009/05/dm-src-ibm.pdf)
57 | 
58 | ## Award
59 | Received the Japanese OSS Encouragement Award. Thanks!
60 | 
61 | ## License
62 | ```
63 | Copyright (C) 2012-2025 Akira Hayakawa
64 | 
65 | This program is free software; you can redistribute it and/or modify
66 | it under the terms of the GNU General Public License as published by
67 | the Free Software Foundation; either version 2 of the License, or
68 | (at your option) any later version.
69 | 
70 | This program is distributed in the hope that it will be useful,
71 | but WITHOUT ANY WARRANTY; without even the implied warranty of
72 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the 73 | GNU General Public License for more details. 74 | 75 | You should have received a copy of the GNU General Public License along 76 | with this program; if not, write to the Free Software Foundation, Inc., 77 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 78 | ``` 79 | 80 | ## Developer Info 81 | Akira Hayakawa (@akiradeveloper) 82 | e-mail: ruby.wktk@gmail.com 83 | -------------------------------------------------------------------------------- /ChangeLog: -------------------------------------------------------------------------------- 1 | 2025-07-10 Akira Hayakawa 2 | 3 | * v2.2.19 4 | * Fix build error with 6.12 kernel 5 | * Refactor 6 | * Update copyright year 7 | 8 | 2024-11-12 Akira Hayakawa 9 | 10 | * v2.2.18 11 | * Fix build error with 6.10 kernel 12 | 13 | 2024-05-01 Akira Hayakawa 14 | 15 | * v2.2.17 16 | * Fix build error with 6.9 kernel and backports 17 | * Improve dkms.conf 18 | 19 | 2023-02-11 Akira Hayakawa 20 | 21 | * v2.2.16 22 | * Fix build error with 6.0 kernel 23 | * Handle STATUSTYPE_IMA 24 | 25 | 2021-05-04 Akira Hayakawa 26 | 27 | * v2.2.15 28 | * Fix build error with 5.12 kernel 29 | 30 | 2021-03-21 Akira Hayakawa 31 | 32 | * v2.2.14 33 | * Fix build error with 5.11 kernel 34 | * Update copyright year 35 | 36 | 2020-11-22 Akira Hayakawa 37 | 38 | * v2.2.13 39 | * Fix build error with 5.9 kernel 40 | 41 | 2020-08-09 Akira Hayakawa 42 | 43 | * v2.2.12 44 | * Fix build error with 5.8 kernel 45 | 46 | 2020-06-05 Akira Hayakawa 47 | 48 | * v2.2.11 49 | * Fix build error with 5.7 kernel 50 | * Update copyright year 51 | 52 | 2018-11-08 Akira Hayakawa 53 | 54 | * v2.2.10 55 | * Fix build error with 4.19 kernel 56 | 57 | 2018-06-09 Akira Hayakawa 58 | 59 | * v2.2.9 60 | * Fix build error with 4.15 kernel 61 | * Fix build error with 4.17 kernel 62 | 63 | 2017-10-15 Akira Hayakawa 64 | 65 | * v2.2.8 66 | * Fix build error with 4.14 kernel 67 | * Support 4Kn devices 68 | 69 | 2017-04-13 Akira Hayakawa 70 | 71 | * v2.2.7 72 | * Fix build error with CentOS 7.3 73 | * Wake up writeback thread only when needed 74 | * Fix doc (deprecated --getsize option) 75 | 76 | 2016-09-19 Akira Hayakawa 77 | 78 | * v2.2.6 79 | * Clarify producer-consumer pattern 80 | * Fix build error with 3.10 kernel 81 | * Fix build error with 3.14 kernel 82 | 83 | 2016-09-12 Akira Hayakawa 84 | 85 | * v2.2.5 86 | * Fix read-caching data corruption issue 87 | * Insert memory barriers 88 | * Code cleanup 89 | 90 | 2016-08-28 Akira Hayakawa 91 | 92 | * v2.2.4 93 | * Fix update_sb_record_interval 94 | * Throttle writeback when there are only few empty segments in the 95 | caching device 96 | * Remove experimental from read-caching 97 | 98 | 2016-08-02 Akira Hayakawa 99 | 100 | * v2.2.3 101 | * Rename write_through_mode to write_around_mode because it's more 102 | precise 103 | * Reformat the caching device when it's write_around_mode 104 | 105 | 2016-07-30 Akira Hayakawa 106 | 107 | * v2.2.2 108 | * Use kmap_atomic() to access the bio payload 109 | * Fix doc (clear_stat) 110 | 111 | 2016-07-18 Akira Hayakawa 112 | 113 | * v2.2.1 114 | * Unsupport TRIM 115 | * Fixes (fail if partial read from caching device fails etc.) 116 | 117 | 2016-05-01 Akira Hayakawa 118 | 119 | * v2.2.0 120 | * Remove partial writeback in foreground. This results in writing 121 | back cached data strictly from the older ones, which makes cache 122 | device corruption safer 123 | * Fix build error for kernel 4.6. 
per_bio_data_size is renamed to 124 | per_io_data_size 125 | * Remove SECTOR_SHIFT 126 | 127 | 2016-03-05 Akira Hayakawa 128 | 129 | * v2.1.2 130 | * Remove blockup mechanism 131 | * Use vmalloc for read_cache_cell's buffer 132 | 133 | 2016-01-04 Akira Hayakawa 134 | 135 | * v2.1.1 136 | * Define bio_endio_compat 137 | * Update copyright date 138 | * Update/fix docs 139 | 140 | 2015-08-02 Akira Hayakawa 141 | 142 | * v2.1.0 143 | * Remove ACCESS_ONCE around cell->cancelled 144 | * Change the type of cell->cancelled from int to bool 145 | * Fix dmsetup table 146 | * Add write_through_mode 147 | 148 | 2015-07-28 Akira Hayakawa 149 | 150 | * v2.0.6 151 | * Use vmalloc for rambuf and writeback_segs 152 | * Fix location of might_queue_current_buffer() (this is a good 153 | refactoring too) 154 | * Fix inject_read_cache so it checks cell->cancelled inside mutex. 155 | * Fix comment (ctr) 156 | 157 | 2015-07-20 Akira Hayakawa 158 | 159 | * v2.0.5 160 | * Add __GFP_NOWARN to allocation of writeback ios 161 | * Use vmalloc for large_array struct 162 | 163 | 2015-07-15 Akira Hayakawa 164 | 165 | * v2.0.4 166 | * Fast-path for clean initialization 167 | * Restrict the nr_max_batched_writeback 168 | 169 | 2015-07-13 Akira Hayakawa 170 | 171 | * v2.0.3 172 | * Use separate wq for barrier flush 173 | 174 | 2015-07-12 Akira Hayakawa 175 | 176 | * v2.0.2 177 | * Fix the crc32c wrapper so it complements the computed value. 178 | 179 | 2015-07-09 Akira Hayakawa 180 | 181 | * v2.0.1 182 | * Fix for "mkfs.xfs -m crc=1" issue. 183 | Add copy_bio_payload(). 184 | * Fix end_io not to ignore error. 185 | * Fix bad pointer access in try_alloc_writeback_ios(). 186 | 187 | 2015-06-16 Akira Hayakawa 188 | 189 | * v2.0.0 190 | * Design change. 191 | Purge static optional args (nr_rambuf_pool, segment_size_order) 192 | so as to work well with Dmitry's tool. 193 | 194 | 2015-05-14 Akira Hayakawa 195 | 196 | * v1.0.1 197 | * Fix read-caching that didn't hit at all. 198 | 199 | 2015-05-10 Akira Hayakawa 200 | 201 | * v1.0.0 202 | -------------------------------------------------------------------------------- /doc/dm-writeboost-readme.txt: -------------------------------------------------------------------------------- 1 | dm-writeboost 2 | ============= 3 | dm-writeboost target provides block-level log-structured caching. 4 | All writes and reads are written to the caching device in sequential manner. 5 | 6 | 7 | Mechanism 8 | ========= 9 | Control three layers (RAM buffer, caching device and backing device) 10 | -------------------------------------------------------------------- 11 | dm-writeboost controls three different layers - RAM buffer (rambuf), caching 12 | device (cache_dev, e.g SSD) and backing device (backing_dev, e.g. HDD). 13 | All data are first stored in the RAM buffer and when the RAM buffer is full, 14 | dm-writeboost adds metadata block (with checksum) on the RAM buffer to create a 15 | "log". Afterward, the log is written to the caching device sequentially by a 16 | background thread and thereafter written back to the backing device in the 17 | background as well. 18 | 19 | 20 | dm-writeboost vs dm-cache or bcache 21 | =================================== 22 | How dm-writeboost differs from other existing SSD-caching drivers? 23 | 24 | The most distinctive point is that dm-writeboost writes to caching device the 25 | least frequently. 
25 | Because it creates a log that contains 127 writes before
26 | it actually writes the log to the caching device, writing to the caching device
27 | happens only once per 127 writes, while other caching drivers write more often.
28 | Since an SSD's lifetime decreases with the number of writes it receives, users can
29 | reduce the risk of wearing out the SSD.
30 | 
31 | dm-writeboost performs much more efficiently than other caching solutions under
32 | small random I/O patterns. But since it always splits requests into 4KB chunks,
33 | it may not be the best choice when the average I/O size of your workload is very large.
34 | However, splitting overhead aside, dm-writeboost is always the best of
35 | all because it caches data in a sequential manner - the most efficient I/O pattern
36 | for the SSD caching device in terms of performance.
37 | 
38 | It's known from experiments that dm-writeboost doesn't perform well when you create
39 | a dm-writeboost'd device in a virtual environment such as KVM. So keep in mind to
40 | use this driver on a physical machine.
41 | 
42 | 
43 | How To Use dm-writeboost
44 | ========================
45 | Trigger caching device reformat
46 | -------------------------------
47 | Reformatting of the caching device is triggered only if the first sector of the
48 | caching device is zeroed out. Note that this operation should be omitted when
49 | you resume the caching device.
50 | e.g. dd if=/dev/zero of=$CACHE oflag=direct bs=512 count=1
51 | 
52 | Construct dm-writeboost'd device
53 | --------------------------------
54 | You can construct a dm-writeboost'd device with the dmsetup create command.
55 | 
56 | 
57 | <essential args> <#optional args> <optional args>
58 | 
59 | - <#optional args> is twice the length of the following list.
60 | - <optional args> is an unordered list of key-value pairs.
61 | 
62 | <essential args>
63 | backing_dev : A block device having original data (e.g. HDD)
64 | cache_dev : A block device having caches (e.g. SSD)
65 | 
66 | <optional args>
67 | see `Optional args`
68 | 
69 | e.g.
70 | BACKING=/dev/sdb # example
71 | CACHE=/dev/sdc # example
72 | sz=`blockdev --getsz ${BACKING}`
73 | dmsetup create wbdev --table "0 $sz writeboost $BACKING $CACHE 2 writeback_threshold 70"
74 | 
75 | Shut down the system
76 | --------------------
77 | On shutting down the system, you don't need to do anything at all. The data
78 | and metadata are safely saved on the caching device. But if you want to
79 | deconstruct the device manually, use dmsetup remove.
80 | 
81 | Resume after system reboot
82 | --------------------------
83 | To resume your caching device from its on-disk state, run the dmsetup create command
84 | with the same parameters but DO NOT zero out the first sector of the caching device.
85 | This replays the logs on the caching device to rebuild the internal data structures.
86 | 
87 | Remove caching device
88 | ---------------------
89 | If you want to detach your caching device for some reason (you don't like
90 | dm-writeboost anymore or you want to upgrade the caching device to a newly
91 | purchased one), the safest way to do this is to clean the dirty data up from your
92 | caching device first and then deconstruct the dm-writeboost'd device.
93 | You can do this by first suspending/resuming the device to drop all transient data
94 | from the RAM buffer and then sending the drop_caches message to drop dirty cache blocks
95 | from the caching device.
96 | e.g.
97 | dmsetup suspend wbdev; dmsetup resume wbdev
98 | dmsetup message wbdev 0 drop_caches
99 | dmsetup remove wbdev
100 | 
101 | Optional args
102 | -------------
103 | writeback_threshold (%)
104 |   accepts: 0..100
105 |   default: 0 (writeback disabled)
106 | Writeback is suppressed when the load of the backing device is higher than
107 | $writeback_threshold.
108 | 
109 | nr_max_batched_writeback
110 |   accepts: 1..32
111 |   default: 32
112 | As an optimization, dm-writeboost writes back $nr_max_batched_writeback segments
113 | simultaneously. The dirty caches in the segments are sorted in ascending order
114 | of the destination address and then written back. Setting a large value can boost
115 | the writeback performance.
116 | 
117 | update_sb_record_interval (sec)
118 |   accepts: 0..3600
119 |   default: 0 (disabled)
120 | Update the superblock every $update_sb_record_interval seconds. 0 means disabled.
121 | The superblock records the ID of the last segment that was written back.
122 | By enabling this, dm-writeboost can skip segments that have already been
123 | written back when resuming and thus shorten the resume time.
124 | 
125 | sync_data_interval (sec)
126 |   accepts: 0..3600
127 |   default: 0 (disabled)
128 | Sync all the volatile data every $sync_data_interval seconds. 0 means disabled.
129 | 
130 | read_cache_threshold (int)
131 |   accepts: 0..127
132 |   default: 0 (read caching disabled)
133 | Consecutive reads larger than $read_cache_threshold * 4KB won't be staged.
134 | 
135 | write_around_mode (bool)
136 |   accepts: 0..1
137 |   default: 0
138 | By enabling this, dm-writeboost writes data directly to the backing device.
139 | 
140 | Messages
141 | --------
142 | You can change the behavior of a dm-writeboost'd device via messages.
143 | 
144 | (1) Optional args
145 | The following optional args can be tuned online.
146 | e.g. dmsetup message wbdev 0 writeback_threshold 70
147 | 
148 | - writeback_threshold
149 | - nr_max_batched_writeback
150 | - update_sb_record_interval
151 | - sync_data_interval
152 | - read_cache_threshold
153 | 
154 | (2) Others
155 | drop_caches
156 | Wait for all dirty data on the caching device to be written back to the backing
157 | device. This is interruptible.
158 | clear_stat
159 | Clear the statistics (see `Status`).
160 | 
161 | Status
162 | ------
163 | <cursor_pos>
164 | <nr_cache_blocks>
165 | <nr_segments>
166 | <current_segment_id>
167 | <last_flushed_segment_id>
168 | <last_writeback_segment_id>
169 | <nr_dirty_cache_blocks>
170 | <stat (write?) x (hit?) x (on_buffer?) x (fullsize?)>
171 | <nr_non_full_flushed>
172 | <#optional args>
173 | <optional args>
--------------------------------------------------------------------------------
/src/dm-writeboost.h:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of dm-writeboost
3 | * Copyright (C) 2012-2025 Akira Hayakawa
4 | *
5 | * This program is free software; you can redistribute it and/or modify
6 | * it under the terms of the GNU General Public License as published by
7 | * the Free Software Foundation; either version 2 of the License, or
8 | * (at your option) any later version.
9 | *
10 | * This program is distributed in the hope that it will be useful,
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | * GNU General Public License for more details.
14 | *
15 | * You should have received a copy of the GNU General Public License along
16 | * with this program; if not, write to the Free Software Foundation, Inc.,
17 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
18 | */ 19 | 20 | #ifndef DM_WRITEBOOST_H 21 | #define DM_WRITEBOOST_H 22 | 23 | #define DM_MSG_PREFIX "writeboost" 24 | 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | 40 | /* We use RHEL_RELEASE_VERSION to compile with RHEL/CentOS 7.3's kernel */ 41 | #ifndef RHEL_RELEASE_CODE 42 | #define RHEL_RELEASE_CODE 0 43 | #define RHEL_RELEASE_VERSION(a,b) (((a) << 8) + (b)) 44 | #endif 45 | 46 | /*----------------------------------------------------------------------------*/ 47 | 48 | #define SUB_ID(x, y) ((x) > (y) ? (x) - (y) : 0) 49 | 50 | /*----------------------------------------------------------------------------*/ 51 | 52 | /* 53 | * The detail of the disk format (SSD) 54 | * ----------------------------------- 55 | * 56 | * ### Overall 57 | * Superblock (1MB) + Segment + Segment ... 58 | * 59 | * ### Superblock 60 | * Head <---- ----> Tail 61 | * Superblock Header (512B) + ... + Superblock Record (512B) 62 | * 63 | * ### Segment 64 | * segment_header_device (512B) + 65 | * metablock_device * nr_caches_inseg + 66 | * data[0] (4KB) + data[1] + ... + data[nr_cache_inseg - 1] 67 | */ 68 | 69 | /*----------------------------------------------------------------------------*/ 70 | 71 | /* 72 | * Superblock Header (Immutable) 73 | * ----------------------------- 74 | * First one sector of the super block region whose value is unchanged after 75 | * formatted. 76 | */ 77 | #define WB_MAGIC 0x57427374 /* Magic number "WBst" */ 78 | struct superblock_header_device { 79 | __le32 magic; 80 | } __packed; 81 | 82 | /* 83 | * Superblock Record (Mutable) 84 | * --------------------------- 85 | * Last one sector of the superblock region. Record the current cache status if 86 | * required. 87 | */ 88 | struct superblock_record_device { 89 | __le64 last_writeback_segment_id; 90 | } __packed; 91 | 92 | /*----------------------------------------------------------------------------*/ 93 | 94 | /* 95 | * The size must be a factor of one sector to avoid starddling neighboring two 96 | * sectors. 97 | */ 98 | struct metablock_device { 99 | __le64 sector; 100 | __u8 dirty_bits; 101 | __u8 padding[16 - (8 + 1)]; /* 16B */ 102 | } __packed; 103 | 104 | struct segment_header_device { 105 | /* 106 | * We assume 1 sector write is atomic. 107 | * This 1 sector region contains important information such as checksum 108 | * of the rest of the segment data. We use 32bit checksum to audit if 109 | * the segment is correctly written to the cache device. 110 | */ 111 | /* - FROM ------------------------------------ */ 112 | __le64 id; 113 | __le32 checksum; 114 | /* 115 | * The number of metablocks in this segment header to be considered in 116 | * log replay. 117 | */ 118 | __u8 length; 119 | __u8 padding[512 - (8 + 4 + 1)]; /* 512B */ 120 | /* - TO -------------------------------------- */ 121 | struct metablock_device mbarr[0]; /* 16B * N */ 122 | } __packed; 123 | 124 | /*----------------------------------------------------------------------------*/ 125 | 126 | struct dirtiness { 127 | bool is_dirty; 128 | u8 data_bits; 129 | }; 130 | 131 | struct metablock { 132 | sector_t sector; /* The original aligned address */ 133 | 134 | u32 idx; /* Const. 
Index in the metablock array */ 135 | 136 | struct hlist_node ht_list; /* Linked to the hash table */ 137 | 138 | struct dirtiness dirtiness; 139 | }; 140 | 141 | #define SZ_MAX (~(size_t)0) 142 | struct segment_header { 143 | u64 id; /* Must be initialized to 0 */ 144 | 145 | u8 length; /* The number of valid metablocks */ 146 | 147 | u32 start_idx; /* Const */ 148 | sector_t start_sector; /* Const */ 149 | 150 | atomic_t nr_inflight_ios; 151 | 152 | struct metablock mb_array[0]; 153 | }; 154 | 155 | /*----------------------------------------------------------------------------*/ 156 | 157 | /* 158 | * RAM buffer is a buffer that any dirty data are first written into. 159 | */ 160 | struct rambuffer { 161 | struct segment_header *seg; 162 | void *data; 163 | struct bio_list barrier_ios; /* List of deferred bios */ 164 | }; 165 | 166 | /*----------------------------------------------------------------------------*/ 167 | 168 | /* 169 | * Batched and Sorted Writeback 170 | * ---------------------------- 171 | * 172 | * Writeback daemon writes back segments on the cache device effectively. 173 | * "Batched" means it writes back number of segments at the same time in 174 | * asynchronous manner. 175 | * "Sorted" means these writeback IOs are sorted in ascending order of LBA in 176 | * the backing device. Rb-tree is used to sort the writeback IOs. 177 | * 178 | * Reading from the cache device is sequential. 179 | */ 180 | 181 | /* 182 | * Writeback of a cache line (or metablock) 183 | */ 184 | struct writeback_io { 185 | struct rb_node rb_node; 186 | 187 | sector_t sector; /* Key */ 188 | u64 id; /* Key */ 189 | 190 | void *data; 191 | u8 data_bits; 192 | }; 193 | #define writeback_io_from_node(node) \ 194 | rb_entry((node), struct writeback_io, rb_node) 195 | 196 | /* 197 | * Writeback of a segment 198 | */ 199 | struct writeback_segment { 200 | struct segment_header *seg; /* Segment to write back */ 201 | struct writeback_io *ios; 202 | void *buf; /* Sequentially read */ 203 | }; 204 | 205 | /*----------------------------------------------------------------------------*/ 206 | 207 | struct read_cache_cell { 208 | sector_t sector; 209 | void *data; /* 4KB data read */ 210 | atomic_t cancelled; /* Don't include this */ 211 | struct rb_node rb_node; 212 | }; 213 | 214 | struct read_cache_cells { 215 | struct mutex lock; 216 | 217 | u32 size; 218 | struct read_cache_cell *array; 219 | u32 cursor; 220 | atomic_t ack_count; 221 | sector_t last_sector; /* The last read sector in foreground */ 222 | u32 seqcount; 223 | u32 threshold; 224 | bool over_threshold; 225 | /* 226 | * We use RB-tree for lookup data structure that all elements are 227 | * sorted. Cells are sorted by the sector so we can easily detect 228 | * sequence. 229 | */ 230 | struct rb_root rb_root; 231 | struct workqueue_struct *wq; 232 | }; 233 | 234 | /*----------------------------------------------------------------------------*/ 235 | 236 | enum STATFLAG { 237 | WB_STAT_WRITE = 3, /* Write or read */ 238 | WB_STAT_HIT = 2, /* Hit or miss */ 239 | WB_STAT_ON_BUFFER = 1, /* Found on buffer or on the cache device */ 240 | WB_STAT_FULLSIZE = 0, /* Bio is fullsize or partial */ 241 | }; 242 | #define STATLEN (1 << 4) 243 | 244 | enum WB_FLAG { 245 | WB_CREATED = 0, 246 | }; 247 | 248 | #define SEGMENT_SIZE_ORDER 10 249 | #define NR_RAMBUF_POOL 8 250 | 251 | /* 252 | * The context of the cache target instance. 
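 * It aggregates everything one target instance needs: the backing and
 * caching devices, the current write position (cursor, current_seg,
 * current_rambuf), the segment header array and the chained hash table
 * used for lookup, the RAM buffer pool, the background daemons (flush,
 * writeback, writeback modulator, superblock record updater, data
 * synchronizer), their tunables, and the statistics counters.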
253 | */ 254 | struct wb_device { 255 | struct dm_target *ti; 256 | 257 | struct dm_dev *backing_dev; /* Slow device (HDD) */ 258 | struct dm_dev *cache_dev; /* Fast device (SSD) */ 259 | 260 | bool write_around_mode; 261 | 262 | unsigned nr_ctr_args; 263 | const char **ctr_args; 264 | 265 | bool do_format; /* True if it was the first creation */ 266 | struct mutex io_lock; /* Mutex is light-weighed */ 267 | 268 | /* 269 | * Wq to wait for nr_inflight_ios to be zero. 270 | * nr_inflight_ios of segment header increments inside io_lock. 271 | * While the refcount > 0, the segment can not be overwritten since 272 | * there is at least one bio to direct it. 273 | */ 274 | wait_queue_head_t inflight_ios_wq; 275 | 276 | spinlock_t mb_lock; 277 | 278 | u8 nr_caches_inseg; /* Const */ 279 | 280 | struct kmem_cache *buf_8_cachep; 281 | mempool_t *buf_8_pool; /* 8 sector buffer pool */ 282 | struct workqueue_struct *io_wq; 283 | struct dm_io_client *io_client; 284 | 285 | /*--------------------------------------------------------------------*/ 286 | 287 | /****************** 288 | * Current position 289 | ******************/ 290 | 291 | u32 cursor; /* Metablock index to write next */ 292 | struct segment_header *current_seg; 293 | struct rambuffer *current_rambuf; 294 | 295 | /*--------------------------------------------------------------------*/ 296 | 297 | /********************** 298 | * Segment header array 299 | **********************/ 300 | 301 | u32 nr_segments; /* Const */ 302 | struct large_array *segment_header_array; 303 | 304 | /*--------------------------------------------------------------------*/ 305 | 306 | /******************** 307 | * Chained Hash table 308 | ********************/ 309 | 310 | u32 nr_caches; /* Const */ 311 | struct large_array *htable; 312 | size_t htsize; /* Number of buckets in the hash table */ 313 | 314 | /* 315 | * Our hashtable has one special bucket called null head. 316 | * Orphan metablocks are linked to the null head. 317 | */ 318 | struct ht_head *null_head; 319 | 320 | /*--------------------------------------------------------------------*/ 321 | 322 | /***************** 323 | * RAM buffer pool 324 | *****************/ 325 | 326 | struct rambuffer *rambuf_pool; 327 | 328 | atomic64_t last_queued_segment_id; 329 | 330 | /*--------------------------------------------------------------------*/ 331 | 332 | /******************** 333 | * One-shot Writeback 334 | ********************/ 335 | 336 | struct dm_kcopyd_client *copier; 337 | 338 | /*--------------------------------------------------------------------*/ 339 | 340 | /************** 341 | * Flush Daemon 342 | **************/ 343 | 344 | struct task_struct *flush_daemon; 345 | 346 | /* 347 | * Wait for a specified segment to be flushed. Non-interruptible 348 | * cf. wait_for_flushing() 349 | */ 350 | wait_queue_head_t flush_wait_queue; 351 | 352 | atomic64_t last_flushed_segment_id; 353 | 354 | /*--------------------------------------------------------------------*/ 355 | 356 | /************************* 357 | * Barrier deadline worker 358 | *************************/ 359 | 360 | /* 361 | * We shouldn't use kernel-global workqueue for this worker 362 | * because it may cause timeout for the flush requests. 
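 * Barrier bios are queued onto barrier_ios by queue_barrier_io(), which
 * also kicks flush_barrier_work on this dedicated workqueue
 * (see flush_barrier_ios() in dm-writeboost-daemon.c).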
363 | */ 364 | struct workqueue_struct *barrier_wq; 365 | struct work_struct flush_barrier_work; 366 | struct bio_list barrier_ios; /* List of barrier requests */ 367 | 368 | /*--------------------------------------------------------------------*/ 369 | 370 | /****************** 371 | * Writeback Daemon 372 | ******************/ 373 | 374 | struct task_struct *writeback_daemon; 375 | int allow_writeback; 376 | int urge_writeback; /* Start writeback immediately */ 377 | int force_drop; /* Don't stop writeback */ 378 | atomic64_t last_writeback_segment_id; 379 | 380 | /* 381 | * Wait for a specified segment to be written back. Non-interruptible 382 | * cf. wait_for_writeback() 383 | */ 384 | wait_queue_head_t writeback_wait_queue; 385 | 386 | /* 387 | * Wait for writing back all the dirty caches. Interruptible 388 | */ 389 | wait_queue_head_t wait_drop_caches; 390 | atomic64_t nr_dirty_caches; 391 | 392 | /* 393 | * Wait for a background writeback complete 394 | */ 395 | wait_queue_head_t writeback_io_wait_queue; 396 | atomic_t writeback_io_count; 397 | atomic_t writeback_fail_count; 398 | 399 | u32 nr_max_batched_writeback; /* Tunable */ 400 | u32 nr_max_batched_writeback_saved; 401 | 402 | struct rb_root writeback_tree; 403 | 404 | u32 nr_writeback_segs; 405 | struct writeback_segment **writeback_segs; 406 | u32 nr_cur_batched_writeback; /* Number of segments to be written back */ 407 | u32 nr_empty_segs; 408 | 409 | /*--------------------------------------------------------------------*/ 410 | 411 | /********************* 412 | * Writeback Modulator 413 | *********************/ 414 | 415 | struct task_struct *writeback_modulator; 416 | u8 writeback_threshold; /* Tunable */ 417 | u8 writeback_threshold_saved; 418 | 419 | /*--------------------------------------------------------------------*/ 420 | 421 | /*************************** 422 | * Superblock Record Updater 423 | ***************************/ 424 | 425 | struct task_struct *sb_record_updater; 426 | unsigned long update_sb_record_interval; /* Tunable */ 427 | unsigned long update_sb_record_interval_saved; 428 | 429 | /*--------------------------------------------------------------------*/ 430 | 431 | /******************* 432 | * Data Synchronizer 433 | *******************/ 434 | 435 | struct task_struct *data_synchronizer; 436 | unsigned long sync_data_interval; /* Tunable */ 437 | unsigned long sync_data_interval_saved; 438 | 439 | /*--------------------------------------------------------------------*/ 440 | 441 | /************** 442 | * Read Caching 443 | **************/ 444 | 445 | u32 nr_read_cache_cells; 446 | u32 nr_read_cache_cells_saved; 447 | struct work_struct read_cache_work; 448 | struct read_cache_cells *read_cache_cells; 449 | u32 read_cache_threshold; /* Tunable */ 450 | u32 read_cache_threshold_saved; 451 | 452 | /*--------------------------------------------------------------------*/ 453 | 454 | /************ 455 | * Statistics 456 | ************/ 457 | 458 | atomic64_t stat[STATLEN]; 459 | atomic64_t count_non_full_flushed; 460 | 461 | /*--------------------------------------------------------------------*/ 462 | 463 | unsigned long flags; 464 | }; 465 | 466 | /*----------------------------------------------------------------------------*/ 467 | 468 | struct write_io { 469 | void *data; /* 4KB */ 470 | u8 data_bits; 471 | }; 472 | 473 | void acquire_new_seg(struct wb_device *, u64 id); 474 | void cursor_init(struct wb_device *); 475 | void flush_current_buffer(struct wb_device *); 476 | void 
inc_nr_dirty_caches(struct wb_device *); 477 | void dec_nr_dirty_caches(struct wb_device *); 478 | bool mark_clean_mb(struct wb_device *, struct metablock *); 479 | struct dirtiness read_mb_dirtiness(struct wb_device *, struct segment_header *, struct metablock *); 480 | int prepare_overwrite(struct wb_device *, struct segment_header *, struct metablock *old_mb, struct write_io *, u8 overwrite_bits); 481 | 482 | /*----------------------------------------------------------------------------*/ 483 | 484 | #define ASSERT(cond) BUG_ON(!(cond)) 485 | 486 | #define check_buffer_alignment(buf) \ 487 | do_check_buffer_alignment(buf, #buf, __func__) 488 | void do_check_buffer_alignment(void *, const char *, const char *); 489 | 490 | void bio_io_success_compat(struct bio *bio); 491 | 492 | /* 493 | * dm_io wrapper 494 | * thread: run dm_io in other thread to avoid deadlock 495 | */ 496 | #define wb_io(io_req, num_regions, regions, err_bits, thread) \ 497 | wb_io_internal(wb, (io_req), (num_regions), (regions), \ 498 | (err_bits), (thread), __func__) 499 | int wb_io_internal(struct wb_device *, struct dm_io_request *, 500 | unsigned num_regions, struct dm_io_region *, 501 | unsigned long *err_bits, bool thread, const char *caller); 502 | 503 | sector_t dm_devsize(struct dm_dev *); 504 | 505 | /*----------------------------------------------------------------------------*/ 506 | 507 | #if LINUX_VERSION_CODE >= KERNEL_VERSION(6,0,0) 508 | #define req_is_write(req) op_is_write((req)->bi_opf) 509 | #define WB_IO_WRITE .bi_opf = REQ_OP_WRITE 510 | #define WB_IO_READ .bi_opf = REQ_OP_READ 511 | #define WB_IO_WRITE_FUA .bi_opf = REQ_OP_WRITE | REQ_FUA 512 | #define bio_is_barrier(bio) ((bio)->bi_opf & REQ_PREFLUSH) 513 | #define bio_is_fua(bio) ((bio)->bi_opf & REQ_FUA) 514 | #elif LINUX_VERSION_CODE >= KERNEL_VERSION(4,8,0) 515 | #define req_is_write(req) op_is_write((req)->bi_op) 516 | #define WB_IO_WRITE .bi_op = REQ_OP_WRITE, .bi_op_flags = 0 517 | #define WB_IO_READ .bi_op = REQ_OP_READ, .bi_op_flags = 0 518 | #define WB_IO_WRITE_FUA .bi_op = REQ_OP_WRITE, .bi_op_flags = REQ_FUA 519 | #define bio_is_barrier(bio) ((bio)->bi_opf & REQ_PREFLUSH) 520 | #define bio_is_fua(bio) ((bio)->bi_opf & REQ_FUA) 521 | #else 522 | #define req_is_write(req) ((req)->bi_rw == WRITE) 523 | #define bio_is_barrier(bio) ((bio)->bi_rw & REQ_FLUSH) 524 | #define bio_is_fua(bio) ((bio)->bi_rw & REQ_FUA) 525 | #define WB_IO_WRITE .bi_rw = WRITE 526 | #define WB_IO_READ .bi_rw = READ 527 | #define WB_IO_WRITE_FUA .bi_rw = WRITE_FUA 528 | #endif 529 | 530 | /*----------------------------------------------------------------------------*/ 531 | 532 | #if LINUX_VERSION_CODE >= KERNEL_VERSION(4,15,0) 533 | #define read_once(x) READ_ONCE(x) 534 | #else 535 | #define read_once(x) ACCESS_ONCE(x) 536 | #endif 537 | 538 | /*----------------------------------------------------------------------------*/ 539 | 540 | #if LINUX_VERSION_CODE >= KERNEL_VERSION(5,12,0) 541 | #define dm_blkdev_issue_flush(x, y) blkdev_issue_flush(x) 542 | #elif LINUX_VERSION_CODE >= KERNEL_VERSION(5,8,0) 543 | #define dm_blkdev_issue_flush(x, y) blkdev_issue_flush(x, y) 544 | #else 545 | #define dm_blkdev_issue_flush(x, y) blkdev_issue_flush(x, y, NULL) 546 | #endif 547 | 548 | #endif 549 | -------------------------------------------------------------------------------- /src/dm-writeboost-daemon.c: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of dm-writeboost 3 | * Copyright (C) 2012-2025 Akira 
Hayakawa 4 | * 5 | * This program is free software; you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation; either version 2 of the License, or 8 | * (at your option) any later version. 9 | * 10 | * This program is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | * GNU General Public License for more details. 14 | * 15 | * You should have received a copy of the GNU General Public License along 16 | * with this program; if not, write to the Free Software Foundation, Inc., 17 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 18 | */ 19 | 20 | #include "dm-writeboost.h" 21 | #include "dm-writeboost-metadata.h" 22 | #include "dm-writeboost-daemon.h" 23 | 24 | #if LINUX_VERSION_CODE >= KERNEL_VERSION(5,7,0) 25 | #include 26 | #endif 27 | #include 28 | 29 | /*----------------------------------------------------------------------------*/ 30 | 31 | void queue_barrier_io(struct wb_device *wb, struct bio *bio) 32 | { 33 | mutex_lock(&wb->io_lock); 34 | bio_list_add(&wb->barrier_ios, bio); 35 | mutex_unlock(&wb->io_lock); 36 | 37 | /* 38 | * queue_work does nothing if the work is already in the queue. 39 | * So we don't have to care about it. 40 | */ 41 | queue_work(wb->barrier_wq, &wb->flush_barrier_work); 42 | } 43 | 44 | void flush_barrier_ios(struct work_struct *work) 45 | { 46 | struct wb_device *wb = container_of( 47 | work, struct wb_device, flush_barrier_work); 48 | 49 | if (bio_list_empty(&wb->barrier_ios)) 50 | return; 51 | 52 | atomic64_inc(&wb->count_non_full_flushed); 53 | flush_current_buffer(wb); 54 | } 55 | 56 | /*----------------------------------------------------------------------------*/ 57 | 58 | static void process_deferred_barriers(struct wb_device *wb, struct rambuffer *rambuf) 59 | { 60 | bool has_barrier = !bio_list_empty(&rambuf->barrier_ios); 61 | if (has_barrier) { 62 | struct bio *bio; 63 | 64 | /* Make all the preceding data persistent. */ 65 | int err = dm_blkdev_issue_flush(wb->cache_dev->bdev, GFP_NOIO); 66 | 67 | /* Ack the chained barrier requests. */ 68 | while ((bio = bio_list_pop(&rambuf->barrier_ios))) 69 | /* 70 | * We won't endio with the err returned from blkdev_issue_flush 71 | * because it's sort of meaningless to return a detailed error here 72 | * and other parts of the code even in foreground round the error 73 | * off to bio_io_error which returns a generic error which results in 74 | * IOERR in userland. 
75 | */ 76 | if (unlikely(err)) 77 | bio_io_error(bio); 78 | else 79 | bio_io_success_compat(bio); 80 | } 81 | } 82 | 83 | static bool should_flush(struct wb_device *wb) 84 | { 85 | return atomic64_read(&wb->last_queued_segment_id) > 86 | atomic64_read(&wb->last_flushed_segment_id); 87 | } 88 | 89 | static void do_flush_proc(struct wb_device *wb) 90 | { 91 | struct segment_header *seg; 92 | struct rambuffer *rambuf; 93 | u64 id; 94 | struct dm_io_request io_req; 95 | struct dm_io_region region; 96 | 97 | if (!should_flush(wb)) { 98 | schedule_timeout_interruptible(msecs_to_jiffies(1000)); 99 | return; 100 | } 101 | 102 | id = atomic64_read(&wb->last_flushed_segment_id) + 1; 103 | 104 | smp_rmb(); 105 | 106 | rambuf = get_rambuffer_by_id(wb, id); 107 | seg = rambuf->seg; 108 | 109 | io_req = (struct dm_io_request) { 110 | WB_IO_WRITE, 111 | .client = wb->io_client, 112 | .notify.fn = NULL, 113 | .mem.type = DM_IO_VMA, 114 | .mem.ptr.addr = rambuf->data, 115 | }; 116 | region = (struct dm_io_region) { 117 | .bdev = wb->cache_dev->bdev, 118 | .sector = seg->start_sector, 119 | .count = (seg->length + 1) << 3, 120 | }; 121 | 122 | if (wb_io(&io_req, 1, ®ion, NULL, false)) 123 | return; 124 | 125 | /* 126 | * Deferred ACK for barrier requests 127 | * To serialize barrier ACK in logging we wait for the previous segment 128 | * to be persistently written (if needed). 129 | */ 130 | process_deferred_barriers(wb, rambuf); 131 | 132 | /* 133 | * We can count up the last_flushed_segment_id only after segment 134 | * is written persistently. Counting up the id is serialized. 135 | */ 136 | smp_wmb(); 137 | atomic64_inc(&wb->last_flushed_segment_id); 138 | wake_up(&wb->flush_wait_queue); 139 | } 140 | 141 | int flush_daemon_proc(void *data) 142 | { 143 | struct wb_device *wb = data; 144 | while (!kthread_should_stop()) 145 | do_flush_proc(wb); 146 | return 0; 147 | } 148 | 149 | void wait_for_flushing(struct wb_device *wb, u64 id) 150 | { 151 | wait_event(wb->flush_wait_queue, 152 | atomic64_read(&wb->last_flushed_segment_id) >= id); 153 | smp_rmb(); 154 | } 155 | 156 | /*----------------------------------------------------------------------------*/ 157 | 158 | static void writeback_endio(unsigned long error, void *context) 159 | { 160 | struct wb_device *wb = context; 161 | 162 | if (error) 163 | atomic_inc(&wb->writeback_fail_count); 164 | 165 | if (atomic_dec_and_test(&wb->writeback_io_count)) 166 | wake_up(&wb->writeback_io_wait_queue); 167 | } 168 | 169 | static void submit_writeback_io(struct wb_device *wb, struct writeback_io *writeback_io) 170 | { 171 | ASSERT(writeback_io->data_bits > 0); 172 | 173 | if (writeback_io->data_bits == 255) { 174 | struct dm_io_request io_req_w = { 175 | WB_IO_WRITE, 176 | .client = wb->io_client, 177 | .notify.fn = writeback_endio, 178 | .notify.context = wb, 179 | .mem.type = DM_IO_VMA, 180 | .mem.ptr.addr = writeback_io->data, 181 | }; 182 | struct dm_io_region region_w = { 183 | .bdev = wb->backing_dev->bdev, 184 | .sector = writeback_io->sector, 185 | .count = 1 << 3, 186 | }; 187 | if (wb_io(&io_req_w, 1, ®ion_w, NULL, false)) 188 | writeback_endio(1, wb); 189 | } else { 190 | u8 i; 191 | for (i = 0; i < 8; i++) { 192 | struct dm_io_request io_req_w; 193 | struct dm_io_region region_w; 194 | 195 | bool bit_on = writeback_io->data_bits & (1 << i); 196 | if (!bit_on) 197 | continue; 198 | 199 | io_req_w = (struct dm_io_request) { 200 | WB_IO_WRITE, 201 | .client = wb->io_client, 202 | .notify.fn = writeback_endio, 203 | .notify.context = wb, 204 | .mem.type = 
DM_IO_VMA, 205 | .mem.ptr.addr = writeback_io->data + (i << 9), 206 | }; 207 | region_w = (struct dm_io_region) { 208 | .bdev = wb->backing_dev->bdev, 209 | .sector = writeback_io->sector + i, 210 | .count = 1, 211 | }; 212 | if (wb_io(&io_req_w, 1, ®ion_w, NULL, false)) 213 | writeback_endio(1, wb); 214 | } 215 | } 216 | } 217 | 218 | static void submit_writeback_ios(struct wb_device *wb) 219 | { 220 | struct blk_plug plug; 221 | struct rb_root wt = wb->writeback_tree; 222 | blk_start_plug(&plug); 223 | while (!RB_EMPTY_ROOT(&wt)) { 224 | struct writeback_io *writeback_io = writeback_io_from_node(rb_first(&wt)); 225 | rb_erase(&writeback_io->rb_node, &wt); 226 | submit_writeback_io(wb, writeback_io); 227 | } 228 | blk_finish_plug(&plug); 229 | } 230 | 231 | /* 232 | * Compare two writeback IOs 233 | * If the two have the same sector then compare them with the IDs. 234 | * We process the older ID first and then overwrites with the older. 235 | * 236 | * (10, 3) < (11, 1) 237 | * (10, 3) < (10, 4) 238 | */ 239 | static bool compare_writeback_io(struct writeback_io *a, struct writeback_io *b) 240 | { 241 | ASSERT(a); 242 | ASSERT(b); 243 | if (a->sector < b->sector) 244 | return true; 245 | if (a->id < b->id) 246 | return true; 247 | return false; 248 | } 249 | 250 | static void inc_writeback_io_count(u8 data_bits, size_t *writeback_io_count) 251 | { 252 | if (data_bits == 255) { 253 | (*writeback_io_count)++; 254 | } else { 255 | u8 i; 256 | for (i = 0; i < 8; i++) { 257 | if (data_bits & (1 << i)) 258 | (*writeback_io_count)++; 259 | } 260 | } 261 | } 262 | 263 | /* 264 | * Add writeback IO to RB-tree for sorted writeback. 265 | * All writeback IOs are sorted in ascending order. 266 | */ 267 | static void add_writeback_io(struct wb_device *wb, struct writeback_io *writeback_io) 268 | { 269 | struct rb_node **rbp, *parent; 270 | rbp = &wb->writeback_tree.rb_node; 271 | parent = NULL; 272 | while (*rbp) { 273 | struct writeback_io *parent_io; 274 | parent = *rbp; 275 | parent_io = writeback_io_from_node(parent); 276 | 277 | if (compare_writeback_io(writeback_io, parent_io)) 278 | rbp = &(*rbp)->rb_left; 279 | else 280 | rbp = &(*rbp)->rb_right; 281 | } 282 | rb_link_node(&writeback_io->rb_node, parent, rbp); 283 | rb_insert_color(&writeback_io->rb_node, &wb->writeback_tree); 284 | } 285 | 286 | static int fill_writeback_seg(struct wb_device *wb, struct writeback_segment *writeback_seg) 287 | { 288 | struct segment_header *seg = writeback_seg->seg; 289 | 290 | struct dm_io_request io_req_r = { 291 | WB_IO_READ, 292 | .client = wb->io_client, 293 | .notify.fn = NULL, 294 | .mem.type = DM_IO_VMA, 295 | .mem.ptr.addr = writeback_seg->buf, 296 | }; 297 | struct dm_io_region region_r = { 298 | .bdev = wb->cache_dev->bdev, 299 | .sector = seg->start_sector + (1 << 3), /* Header excluded */ 300 | .count = seg->length << 3, 301 | }; 302 | 303 | /* 304 | * dm_io() allows region.count = 0 305 | * so we don't need to skip here in case of seg->length = 0 306 | */ 307 | return wb_io(&io_req_r, 1, ®ion_r, NULL, false); 308 | } 309 | 310 | static void prepare_writeback_ios(struct wb_device *wb, struct writeback_segment *writeback_seg, 311 | size_t *writeback_io_count) 312 | { 313 | struct segment_header *seg = writeback_seg->seg; 314 | 315 | u8 i; 316 | for (i = 0; i < seg->length; i++) { 317 | struct writeback_io *writeback_io; 318 | 319 | struct metablock *mb = seg->mb_array + i; 320 | struct dirtiness dirtiness = read_mb_dirtiness(wb, seg, mb); 321 | ASSERT(dirtiness.data_bits > 0); 322 | if 
(!dirtiness.is_dirty) 323 | continue; 324 | 325 | writeback_io = writeback_seg->ios + i; 326 | writeback_io->sector = mb->sector; 327 | writeback_io->id = seg->id; 328 | /* writeback_io->data is already set */ 329 | writeback_io->data_bits = dirtiness.data_bits; 330 | 331 | inc_writeback_io_count(writeback_io->data_bits, writeback_io_count); 332 | add_writeback_io(wb, writeback_io); 333 | } 334 | } 335 | 336 | void mark_clean_seg(struct wb_device *wb, struct segment_header *seg) 337 | { 338 | u8 i; 339 | for (i = 0; i < seg->length; i++) { 340 | struct metablock *mb = seg->mb_array + i; 341 | if (mark_clean_mb(wb, mb)) 342 | dec_nr_dirty_caches(wb); 343 | } 344 | } 345 | 346 | /* 347 | * Try writeback some specified segs and returns if all writeback ios succeeded. 348 | */ 349 | static bool try_writeback_segs(struct wb_device *wb) 350 | { 351 | struct writeback_segment *writeback_seg; 352 | size_t writeback_io_count = 0; 353 | u32 k; 354 | 355 | /* Create RB-tree */ 356 | wb->writeback_tree = RB_ROOT; 357 | for (k = 0; k < wb->nr_cur_batched_writeback; k++) { 358 | writeback_seg = *(wb->writeback_segs + k); 359 | 360 | if (fill_writeback_seg(wb, writeback_seg)) 361 | return false; 362 | 363 | prepare_writeback_ios(wb, writeback_seg, &writeback_io_count); 364 | } 365 | 366 | atomic_set(&wb->writeback_io_count, writeback_io_count); 367 | atomic_set(&wb->writeback_fail_count, 0); 368 | 369 | /* Pop rbnodes out of the tree and submit writeback I/Os */ 370 | submit_writeback_ios(wb); 371 | wait_event(wb->writeback_io_wait_queue, !atomic_read(&wb->writeback_io_count)); 372 | 373 | return atomic_read(&wb->writeback_fail_count) == 0; 374 | } 375 | 376 | static bool do_writeback_segs(struct wb_device *wb) 377 | { 378 | if (!try_writeback_segs(wb)) 379 | return false; 380 | 381 | dm_blkdev_issue_flush(wb->backing_dev->bdev, GFP_NOIO); 382 | return true; 383 | } 384 | 385 | /* 386 | * Calculate the number of segments to write back. 
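 * nr_empty_segs (updated by update_nr_empty_segs() below) is how many more
 * segments the current position can advance before it reaches segments that
 * have not been written back yet. calc_nr_writeback() then takes the minimum
 * of: the number of segments flushed but not yet written back, the allocated
 * batch size (nr_writeback_segs), and nr_empty_segs + 1.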
387 | */ 388 | void update_nr_empty_segs(struct wb_device *wb) 389 | { 390 | wb->nr_empty_segs = 391 | atomic64_read(&wb->last_writeback_segment_id) + wb->nr_segments 392 | - wb->current_seg->id; 393 | } 394 | 395 | static u32 calc_nr_writeback(struct wb_device *wb) 396 | { 397 | u32 nr_writeback_candidates = 398 | atomic64_read(&wb->last_flushed_segment_id) 399 | - atomic64_read(&wb->last_writeback_segment_id); 400 | 401 | u32 nr_max_batch = read_once(wb->nr_max_batched_writeback); 402 | if (wb->nr_writeback_segs != nr_max_batch) 403 | try_alloc_writeback_ios(wb, nr_max_batch, GFP_NOIO | __GFP_NOWARN); 404 | 405 | return min3(nr_writeback_candidates, wb->nr_writeback_segs, wb->nr_empty_segs + 1); 406 | } 407 | 408 | static bool should_writeback(struct wb_device *wb) 409 | { 410 | return read_once(wb->allow_writeback) || 411 | read_once(wb->urge_writeback) || 412 | read_once(wb->force_drop); 413 | } 414 | 415 | static void do_writeback_proc(struct wb_device *wb) 416 | { 417 | u32 k, nr_writeback_tbd; 418 | 419 | if (!should_writeback(wb)) { 420 | schedule_timeout_interruptible(msecs_to_jiffies(1000)); 421 | return; 422 | } 423 | 424 | nr_writeback_tbd = calc_nr_writeback(wb); 425 | if (!nr_writeback_tbd) { 426 | schedule_timeout_interruptible(msecs_to_jiffies(1000)); 427 | return; 428 | } 429 | 430 | smp_rmb(); 431 | 432 | /* Store segments into writeback_segs */ 433 | for (k = 0; k < nr_writeback_tbd; k++) { 434 | struct writeback_segment *writeback_seg = *(wb->writeback_segs + k); 435 | writeback_seg->seg = get_segment_header_by_id(wb, 436 | atomic64_read(&wb->last_writeback_segment_id) + 1 + k); 437 | } 438 | wb->nr_cur_batched_writeback = nr_writeback_tbd; 439 | 440 | if (!do_writeback_segs(wb)) 441 | return; 442 | 443 | /* A segment after written back is clean */ 444 | for (k = 0; k < wb->nr_cur_batched_writeback; k++) { 445 | struct writeback_segment *writeback_seg = *(wb->writeback_segs + k); 446 | mark_clean_seg(wb, writeback_seg->seg); 447 | } 448 | 449 | smp_wmb(); 450 | atomic64_add(wb->nr_cur_batched_writeback, &wb->last_writeback_segment_id); 451 | wake_up(&wb->writeback_wait_queue); 452 | } 453 | 454 | int writeback_daemon_proc(void *data) 455 | { 456 | struct wb_device *wb = data; 457 | while (!kthread_should_stop()) 458 | do_writeback_proc(wb); 459 | return 0; 460 | } 461 | 462 | /* 463 | * Wait for a segment to be written back. 464 | * The segment after written back is clean. 
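 * If the requested segment hasn't been written back yet, this sets
 * urge_writeback, wakes the writeback daemon, and sleeps on
 * writeback_wait_queue until last_writeback_segment_id reaches the id.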
465 | */ 466 | void wait_for_writeback(struct wb_device *wb, u64 id) 467 | { 468 | if (atomic64_read(&wb->last_writeback_segment_id) < id) { 469 | wb->urge_writeback = true; 470 | wake_up_process(wb->writeback_daemon); 471 | wait_event(wb->writeback_wait_queue, 472 | atomic64_read(&wb->last_writeback_segment_id) >= id); 473 | wb->urge_writeback = false; 474 | } 475 | smp_rmb(); 476 | } 477 | 478 | /*----------------------------------------------------------------------------*/ 479 | 480 | int writeback_modulator_proc(void *data) 481 | { 482 | struct wb_device *wb = data; 483 | 484 | #if LINUX_VERSION_CODE >= KERNEL_VERSION(5,11,0) 485 | struct block_device *part = wb->backing_dev->bdev; 486 | #else 487 | struct hd_struct *part = wb->backing_dev->bdev->bd_part; 488 | #endif 489 | unsigned long old = 0, new, util; 490 | unsigned long intvl = 1000; 491 | 492 | while (!kthread_should_stop()) { 493 | new = jiffies_to_msecs(part_stat_read(part, io_ticks)); 494 | 495 | util = div_u64(100 * (new - old), 1000); 496 | 497 | if (util < read_once(wb->writeback_threshold)) 498 | wb->allow_writeback = true; 499 | else 500 | wb->allow_writeback = false; 501 | 502 | old = new; 503 | 504 | update_nr_empty_segs(wb); 505 | 506 | schedule_timeout_interruptible(msecs_to_jiffies(intvl)); 507 | } 508 | return 0; 509 | } 510 | 511 | /*----------------------------------------------------------------------------*/ 512 | 513 | static void update_superblock_record(struct wb_device *wb) 514 | { 515 | struct superblock_record_device o; 516 | void *buf; 517 | struct dm_io_request io_req; 518 | struct dm_io_region region; 519 | 520 | o.last_writeback_segment_id = 521 | cpu_to_le64(atomic64_read(&wb->last_writeback_segment_id)); 522 | 523 | buf = mempool_alloc(wb->buf_8_pool, GFP_NOIO); 524 | if (!buf) 525 | return; 526 | 527 | memset(buf, 0, 8 << 9); 528 | memcpy(buf + (7 << 9), &o, sizeof(o)); 529 | 530 | io_req = (struct dm_io_request) { 531 | WB_IO_WRITE_FUA, 532 | .client = wb->io_client, 533 | .notify.fn = NULL, 534 | .mem.type = DM_IO_KMEM, 535 | .mem.ptr.addr = buf, 536 | }; 537 | region = (struct dm_io_region) { 538 | .bdev = wb->cache_dev->bdev, 539 | .sector = (1 << 11) - 8, 540 | .count = 8, 541 | }; 542 | wb_io(&io_req, 1, &region, NULL, false); 543 | 544 | mempool_free(buf, wb->buf_8_pool); 545 | } 546 | 547 | int sb_record_updater_proc(void *data) 548 | { 549 | struct wb_device *wb = data; 550 | 551 | unsigned long intvl; 552 | 553 | while (!kthread_should_stop()) { 554 | /* sec -> ms */ 555 | intvl = read_once(wb->update_sb_record_interval) * 1000; 556 | 557 | if (!intvl) { 558 | schedule_timeout_interruptible(msecs_to_jiffies(1000)); 559 | continue; 560 | } 561 | 562 | update_superblock_record(wb); 563 | schedule_timeout_interruptible(msecs_to_jiffies(intvl)); 564 | } 565 | return 0; 566 | } 567 | 568 | /*----------------------------------------------------------------------------*/ 569 | 570 | int data_synchronizer_proc(void *data) 571 | { 572 | struct wb_device *wb = data; 573 | unsigned long intvl; 574 | 575 | while (!kthread_should_stop()) { 576 | /* sec -> ms */ 577 | intvl = read_once(wb->sync_data_interval) * 1000; 578 | 579 | if (!intvl) { 580 | schedule_timeout_interruptible(msecs_to_jiffies(1000)); 581 | continue; 582 | } 583 | 584 | flush_current_buffer(wb); 585 | dm_blkdev_issue_flush(wb->cache_dev->bdev, GFP_NOIO); 586 | schedule_timeout_interruptible(msecs_to_jiffies(intvl)); 587 | } 588 | return 0; 589 | } 590 | --------------------------------------------------------------------------------
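The modulator loop above reduces to a simple rule: once a second, sample the backing device's io_ticks, convert the delta into a utilization percentage over the 1000 ms window, and permit background writeback only while that utilization stays below the user-set writeback_threshold. The following standalone sketch (not part of the module; the helper names and sample numbers are illustrative only) restates that rule in plain C:

/*
 * Sketch of the throttling rule used by writeback_modulator_proc().
 * The helpers and the sample values below are hypothetical, for
 * illustration only; they are not part of the dm-writeboost sources.
 */
#include <stdbool.h>
#include <stdio.h>

/* io_ticks delta over a 1000 ms polling window, as a percentage */
static unsigned long util_percent(unsigned long old_io_ticks_ms,
                                  unsigned long new_io_ticks_ms)
{
        return (100 * (new_io_ticks_ms - old_io_ticks_ms)) / 1000;
}

/* writeback is allowed only while the backing device is mostly idle */
static bool allow_writeback(unsigned long util, unsigned long threshold)
{
        return util < threshold;
}

int main(void)
{
        /* e.g. the backing device was busy for 350 ms of the last second */
        unsigned long util = util_percent(10000, 10350);
        printf("util=%lu%% allow_writeback=%d\n", util, allow_writeback(util, 70));
        return 0;
}

Note that with the initial writeback_threshold of 0 (set in init_writeback_modulator) the predicate util < threshold can never hold, so the modulator alone never enables writeback; in that configuration writeback proceeds only via urge_writeback or force_drop.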
/LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. 
The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. 
But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 
176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. 
If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 
287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | {description} 294 | Copyright (C) {year} {fullname} 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 331 | 332 | {signature of Ty Coon}, 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 340 | 341 | -------------------------------------------------------------------------------- /src/dm-writeboost-metadata.c: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of dm-writeboost 3 | * Copyright (C) 2012-2025 Akira Hayakawa 4 | * 5 | * This program is free software; you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation; either version 2 of the License, or 8 | * (at your option) any later version. 9 | * 10 | * This program is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the 13 | * GNU General Public License for more details. 14 | * 15 | * You should have received a copy of the GNU General Public License along 16 | * with this program; if not, write to the Free Software Foundation, Inc., 17 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 18 | */ 19 | 20 | #include "dm-writeboost.h" 21 | #include "dm-writeboost-metadata.h" 22 | #include "dm-writeboost-daemon.h" 23 | 24 | /*----------------------------------------------------------------------------*/ 25 | 26 | struct large_array { 27 | u64 nr_elems; 28 | u32 elemsize; 29 | void *data; 30 | }; 31 | 32 | static struct large_array *large_array_alloc(u32 elemsize, u64 nr_elems) 33 | { 34 | struct large_array *arr = kmalloc(sizeof(*arr), GFP_KERNEL); 35 | if (!arr) { 36 | DMERR("Failed to allocate arr"); 37 | return NULL; 38 | } 39 | 40 | arr->elemsize = elemsize; 41 | arr->nr_elems = nr_elems; 42 | 43 | arr->data = vmalloc(elemsize * nr_elems); 44 | if (!arr->data) { 45 | DMERR("Failed to allocate data"); 46 | goto bad_alloc_data; 47 | } 48 | 49 | return arr; 50 | 51 | bad_alloc_data: 52 | kfree(arr); 53 | return NULL; 54 | } 55 | 56 | static void large_array_free(struct large_array *arr) 57 | { 58 | vfree(arr->data); 59 | kfree(arr); 60 | } 61 | 62 | static void *large_array_at(struct large_array *arr, u64 i) 63 | { 64 | return arr->data + arr->elemsize * i; 65 | } 66 | 67 | /*----------------------------------------------------------------------------*/ 68 | 69 | /* 70 | * Get the in-core metablock of the given index. 71 | */ 72 | static struct metablock *mb_at(struct wb_device *wb, u32 idx) 73 | { 74 | u32 idx_inseg; 75 | u32 seg_idx = div_u64_rem(idx, wb->nr_caches_inseg, &idx_inseg); 76 | struct segment_header *seg = large_array_at(wb->segment_header_array, seg_idx); 77 | return seg->mb_array + idx_inseg; 78 | } 79 | 80 | static void mb_array_empty_init(struct wb_device *wb) 81 | { 82 | u32 i; 83 | for (i = 0; i < wb->nr_caches; i++) { 84 | struct metablock *mb = mb_at(wb, i); 85 | INIT_HLIST_NODE(&mb->ht_list); 86 | 87 | mb->idx = i; 88 | mb->dirtiness.data_bits = 0; 89 | mb->dirtiness.is_dirty = false; 90 | } 91 | } 92 | 93 | /* 94 | * Calc the starting sector of the k-th segment 95 | */ 96 | static sector_t calc_segment_header_start(struct wb_device *wb, u32 k) 97 | { 98 | return (1 << 11) + (1 << SEGMENT_SIZE_ORDER) * k; 99 | } 100 | 101 | static u32 calc_nr_segments(struct dm_dev *dev, struct wb_device *wb) 102 | { 103 | sector_t devsize = dm_devsize(dev); 104 | return div_u64(devsize - (1 << 11), 1 << SEGMENT_SIZE_ORDER); 105 | } 106 | 107 | /* 108 | * Get the relative index in a segment of the mb_idx-th metablock 109 | */ 110 | u8 mb_idx_inseg(struct wb_device *wb, u32 mb_idx) 111 | { 112 | u32 tmp32; 113 | div_u64_rem(mb_idx, wb->nr_caches_inseg, &tmp32); 114 | return tmp32; 115 | } 116 | 117 | /* 118 | * Calc the starting sector of the mb_idx-th cache block 119 | */ 120 | sector_t calc_mb_start_sector(struct wb_device *wb, struct segment_header *seg, u32 mb_idx) 121 | { 122 | return seg->start_sector + ((1 + mb_idx_inseg(wb, mb_idx)) << 3); 123 | } 124 | 125 | /* 126 | * Get the segment that contains the passed mb 127 | */ 128 | struct segment_header *mb_to_seg(struct wb_device *wb, struct metablock *mb) 129 | { 130 | struct segment_header *seg; 131 | seg = ((void *) mb) 132 | - mb_idx_inseg(wb, mb->idx) * sizeof(struct metablock) 133 | - sizeof(struct segment_header); 134 | return seg; 135 | } 136 | 137 | bool is_on_buffer(struct wb_device *wb, u32 mb_idx) 138 | { 139 | u32 
start = wb->current_seg->start_idx; 140 | if (mb_idx < start) 141 | return false; 142 | 143 | if (mb_idx >= (start + wb->nr_caches_inseg)) 144 | return false; 145 | 146 | return true; 147 | } 148 | 149 | static u32 segment_id_to_idx(struct wb_device *wb, u64 id) 150 | { 151 | u32 idx; 152 | div_u64_rem(id - 1, wb->nr_segments, &idx); 153 | return idx; 154 | } 155 | 156 | static struct segment_header *segment_at(struct wb_device *wb, u32 k) 157 | { 158 | return large_array_at(wb->segment_header_array, k); 159 | } 160 | 161 | /* 162 | * Get the segment from the segment id. 163 | * The index of the segment is calculated from the segment id. 164 | */ 165 | struct segment_header *get_segment_header_by_id(struct wb_device *wb, u64 id) 166 | { 167 | return segment_at(wb, segment_id_to_idx(wb, id)); 168 | } 169 | 170 | /*----------------------------------------------------------------------------*/ 171 | 172 | static int init_segment_header_array(struct wb_device *wb) 173 | { 174 | u32 segment_idx; 175 | 176 | wb->segment_header_array = large_array_alloc( 177 | sizeof(struct segment_header) + 178 | sizeof(struct metablock) * wb->nr_caches_inseg, 179 | wb->nr_segments); 180 | if (!wb->segment_header_array) { 181 | DMERR("Failed to allocate segment_header_array"); 182 | return -ENOMEM; 183 | } 184 | 185 | for (segment_idx = 0; segment_idx < wb->nr_segments; segment_idx++) { 186 | struct segment_header *seg = large_array_at(wb->segment_header_array, segment_idx); 187 | 188 | seg->id = 0; 189 | seg->length = 0; 190 | atomic_set(&seg->nr_inflight_ios, 0); 191 | 192 | /* Const values */ 193 | seg->start_idx = wb->nr_caches_inseg * segment_idx; 194 | seg->start_sector = calc_segment_header_start(wb, segment_idx); 195 | } 196 | 197 | mb_array_empty_init(wb); 198 | 199 | return 0; 200 | } 201 | 202 | static void free_segment_header_array(struct wb_device *wb) 203 | { 204 | large_array_free(wb->segment_header_array); 205 | } 206 | 207 | /*----------------------------------------------------------------------------*/ 208 | 209 | struct ht_head { 210 | struct hlist_head ht_list; 211 | }; 212 | 213 | static int ht_empty_init(struct wb_device *wb) 214 | { 215 | u32 idx; 216 | size_t i, nr_heads; 217 | struct large_array *arr; 218 | 219 | wb->htsize = wb->nr_caches; 220 | nr_heads = wb->htsize + 1; 221 | arr = large_array_alloc(sizeof(struct ht_head), nr_heads); 222 | if (!arr) { 223 | DMERR("Failed to allocate htable"); 224 | return -ENOMEM; 225 | } 226 | 227 | wb->htable = arr; 228 | 229 | for (i = 0; i < nr_heads; i++) { 230 | struct ht_head *hd = large_array_at(arr, i); 231 | INIT_HLIST_HEAD(&hd->ht_list); 232 | } 233 | 234 | wb->null_head = large_array_at(wb->htable, wb->htsize); 235 | 236 | for (idx = 0; idx < wb->nr_caches; idx++) { 237 | struct metablock *mb = mb_at(wb, idx); 238 | hlist_add_head(&mb->ht_list, &wb->null_head->ht_list); 239 | } 240 | 241 | return 0; 242 | } 243 | 244 | static void free_ht(struct wb_device *wb) 245 | { 246 | large_array_free(wb->htable); 247 | } 248 | 249 | struct ht_head *ht_get_head(struct wb_device *wb, struct lookup_key *key) 250 | { 251 | u32 idx; 252 | div_u64_rem(key->sector >> 3, wb->htsize, &idx); 253 | return large_array_at(wb->htable, idx); 254 | } 255 | 256 | static bool mb_hit(struct metablock *mb, struct lookup_key *key) 257 | { 258 | return mb->sector == key->sector; 259 | } 260 | 261 | /* 262 | * Remove the metablock from the hashtable and link the orphan to the null head. 
263 | */ 264 | void ht_del(struct wb_device *wb, struct metablock *mb) 265 | { 266 | struct ht_head *null_head; 267 | 268 | hlist_del(&mb->ht_list); 269 | 270 | null_head = wb->null_head; 271 | hlist_add_head(&mb->ht_list, &null_head->ht_list); 272 | } 273 | 274 | void ht_register(struct wb_device *wb, struct ht_head *head, 275 | struct metablock *mb, struct lookup_key *key) 276 | { 277 | hlist_del(&mb->ht_list); 278 | hlist_add_head(&mb->ht_list, &head->ht_list); 279 | 280 | BUG_ON(key->sector & 7); // should be 4KB aligned 281 | mb->sector = key->sector; 282 | }; 283 | 284 | struct metablock *ht_lookup(struct wb_device *wb, struct ht_head *head, 285 | struct lookup_key *key) 286 | { 287 | struct metablock *mb, *found = NULL; 288 | hlist_for_each_entry(mb, &head->ht_list, ht_list) { 289 | if (mb_hit(mb, key)) { 290 | found = mb; 291 | break; 292 | } 293 | } 294 | return found; 295 | } 296 | 297 | /* 298 | * Remove all the metablock in the segment from the lookup table. 299 | */ 300 | void discard_caches_inseg(struct wb_device *wb, struct segment_header *seg) 301 | { 302 | u8 i; 303 | for (i = 0; i < wb->nr_caches_inseg; i++) { 304 | struct metablock *mb = seg->mb_array + i; 305 | ht_del(wb, mb); 306 | } 307 | } 308 | 309 | /*----------------------------------------------------------------------------*/ 310 | 311 | static int read_superblock_header(struct superblock_header_device *sup, 312 | struct wb_device *wb) 313 | { 314 | int err = 0; 315 | struct dm_io_request io_req_sup; 316 | struct dm_io_region region_sup; 317 | 318 | void *buf = mempool_alloc(wb->buf_8_pool, GFP_KERNEL); 319 | if (!buf) 320 | return -ENOMEM; 321 | check_buffer_alignment(buf); 322 | 323 | io_req_sup = (struct dm_io_request) { 324 | WB_IO_READ, 325 | .client = wb->io_client, 326 | .notify.fn = NULL, 327 | .mem.type = DM_IO_KMEM, 328 | .mem.ptr.addr = buf, 329 | }; 330 | region_sup = (struct dm_io_region) { 331 | .bdev = wb->cache_dev->bdev, 332 | .sector = 0, 333 | .count = 8, 334 | }; 335 | err = wb_io(&io_req_sup, 1, ®ion_sup, NULL, false); 336 | if (err) 337 | goto bad_io; 338 | 339 | memcpy(sup, buf, sizeof(*sup)); 340 | 341 | bad_io: 342 | mempool_free(buf, wb->buf_8_pool); 343 | return err; 344 | } 345 | 346 | /* 347 | * check if the cache device is already formatted. 348 | * returns 0 iff this routine runs without failure. 
349 | */ 350 | static int audit_cache_device(struct wb_device *wb) 351 | { 352 | int err = 0; 353 | struct superblock_header_device sup; 354 | err = read_superblock_header(&sup, wb); 355 | if (err) { 356 | DMERR("read_superblock_header failed"); 357 | return err; 358 | } 359 | 360 | wb->do_format = false; 361 | if (le32_to_cpu(sup.magic) != WB_MAGIC || 362 | wb->write_around_mode) { /* write-around mode should discard all caches */ 363 | wb->do_format = true; 364 | DMERR("Superblock Header: Magic number invalid"); 365 | return 0; 366 | } 367 | 368 | return err; 369 | } 370 | 371 | static int format_superblock_header(struct wb_device *wb) 372 | { 373 | int err = 0; 374 | 375 | struct dm_io_request io_req_sup; 376 | struct dm_io_region region_sup; 377 | 378 | struct superblock_header_device sup = { 379 | .magic = cpu_to_le32(WB_MAGIC), 380 | }; 381 | 382 | void *buf = mempool_alloc(wb->buf_8_pool, GFP_KERNEL); 383 | if (!buf) 384 | return -ENOMEM; 385 | 386 | memset(buf, 0, 8 << 9); 387 | memcpy(buf, &sup, sizeof(sup)); 388 | 389 | io_req_sup = (struct dm_io_request) { 390 | WB_IO_WRITE_FUA, 391 | .client = wb->io_client, 392 | .notify.fn = NULL, 393 | .mem.type = DM_IO_KMEM, 394 | .mem.ptr.addr = buf, 395 | }; 396 | region_sup = (struct dm_io_region) { 397 | .bdev = wb->cache_dev->bdev, 398 | .sector = 0, 399 | .count = 8, 400 | }; 401 | err = wb_io(&io_req_sup, 1, ®ion_sup, NULL, false); 402 | if (err) 403 | goto bad_io; 404 | 405 | bad_io: 406 | mempool_free(buf, wb->buf_8_pool); 407 | return err; 408 | } 409 | 410 | struct format_segmd_context { 411 | int err; 412 | atomic64_t count; 413 | }; 414 | 415 | static void format_segmd_endio(unsigned long error, void *__context) 416 | { 417 | struct format_segmd_context *context = __context; 418 | if (error) 419 | context->err = 1; 420 | atomic64_dec(&context->count); 421 | } 422 | 423 | struct zeroing_context { 424 | int error; 425 | struct completion complete; 426 | }; 427 | 428 | static void zeroing_complete(int read_err, unsigned long write_err, void *context) 429 | { 430 | struct zeroing_context *zc = context; 431 | if (read_err || write_err) 432 | zc->error = -EIO; 433 | complete(&zc->complete); 434 | } 435 | 436 | /* 437 | * Synchronously zeroes out a region on a device. 438 | */ 439 | static int do_zeroing_region(struct wb_device *wb, struct dm_io_region *region) 440 | { 441 | struct zeroing_context zc; 442 | zc.error = 0; 443 | init_completion(&zc.complete); 444 | dm_kcopyd_zero(wb->copier, 1, region, 0, zeroing_complete, &zc); 445 | wait_for_completion(&zc.complete); 446 | return zc.error; 447 | } 448 | 449 | static int zeroing_full_superblock(struct wb_device *wb) 450 | { 451 | struct dm_io_region region = { 452 | .bdev = wb->cache_dev->bdev, 453 | .sector = 0, 454 | .count = 1 << 11, 455 | }; 456 | return do_zeroing_region(wb, ®ion); 457 | } 458 | 459 | static int format_all_segment_headers(struct wb_device *wb) 460 | { 461 | int err = 0; 462 | struct dm_dev *dev = wb->cache_dev; 463 | u32 i; 464 | 465 | struct format_segmd_context context; 466 | 467 | void *buf = mempool_alloc(wb->buf_8_pool, GFP_KERNEL); 468 | if (!buf) 469 | return -ENOMEM; 470 | 471 | memset(buf, 0, 8 << 9); 472 | check_buffer_alignment(buf); 473 | 474 | atomic64_set(&context.count, wb->nr_segments); 475 | context.err = 0; 476 | 477 | /* Submit all the writes asynchronously. 
*/ 478 | for (i = 0; i < wb->nr_segments; i++) { 479 | struct dm_io_request io_req_seg = { 480 | WB_IO_WRITE, 481 | .client = wb->io_client, 482 | .notify.fn = format_segmd_endio, 483 | .notify.context = &context, 484 | .mem.type = DM_IO_KMEM, 485 | .mem.ptr.addr = buf, 486 | }; 487 | struct dm_io_region region_seg = { 488 | .bdev = dev->bdev, 489 | .sector = calc_segment_header_start(wb, i), 490 | .count = (1 << 3), 491 | }; 492 | err = wb_io(&io_req_seg, 1, &region_seg, NULL, false); 493 | if (err) 494 | break; 495 | } 496 | 497 | if (err) 498 | goto bad; 499 | 500 | /* Wait for all the writes to complete. */ 501 | while (atomic64_read(&context.count)) 502 | schedule_timeout_interruptible(msecs_to_jiffies(100)); 503 | 504 | if (context.err) { 505 | DMERR("I/O failed"); 506 | err = -EIO; 507 | goto bad; 508 | } 509 | 510 | err = dm_blkdev_issue_flush(dev->bdev, GFP_KERNEL); 511 | 512 | bad: 513 | mempool_free(buf, wb->buf_8_pool); 514 | return err; 515 | } 516 | 517 | /* 518 | * Format superblock header and all the segment headers in a cache device 519 | */ 520 | static int format_cache_device(struct wb_device *wb) 521 | { 522 | int err = zeroing_full_superblock(wb); 523 | if (err) { 524 | DMERR("zeroing_full_superblock failed"); 525 | return err; 526 | } 527 | err = format_all_segment_headers(wb); 528 | if (err) { 529 | DMERR("format_all_segment_headers failed"); 530 | return err; 531 | } 532 | err = format_superblock_header(wb); /* First 512B */ 533 | if (err) { 534 | DMERR("format_superblock_header failed"); 535 | return err; 536 | } 537 | return err; 538 | } 539 | 540 | /* 541 | * First check if the superblock and the passed arguments are consistent and 542 | * re-format the cache structure if they are not. 543 | * If you want to re-format the cache device, you must zero out the first 544 | * sector of the device. 
545 | */ 546 | static int might_format_cache_device(struct wb_device *wb) 547 | { 548 | int err = 0; 549 | 550 | err = audit_cache_device(wb); 551 | if (err) { 552 | DMERR("audit_cache_device failed"); 553 | return err; 554 | } 555 | 556 | if (wb->do_format) { 557 | err = format_cache_device(wb); 558 | if (err) { 559 | DMERR("format_cache_device failed"); 560 | return err; 561 | } 562 | } 563 | 564 | return err; 565 | } 566 | 567 | /*----------------------------------------------------------------------------*/ 568 | 569 | static int init_rambuf_pool(struct wb_device *wb) 570 | { 571 | int err = 0; 572 | size_t i; 573 | 574 | wb->rambuf_pool = kmalloc(sizeof(struct rambuffer) * NR_RAMBUF_POOL, GFP_KERNEL); 575 | if (!wb->rambuf_pool) 576 | return -ENOMEM; 577 | 578 | for (i = 0; i < NR_RAMBUF_POOL; i++) { 579 | void *alloced = vmalloc(1 << (SEGMENT_SIZE_ORDER + 9)); 580 | if (!alloced) { 581 | size_t j; 582 | DMERR("Failed to allocate rambuf->data"); 583 | for (j = 0; j < i; j++) { 584 | vfree(wb->rambuf_pool[j].data); 585 | } 586 | err = -ENOMEM; 587 | goto bad_alloc_data; 588 | } 589 | wb->rambuf_pool[i].data = alloced; 590 | } 591 | 592 | return err; 593 | 594 | bad_alloc_data: 595 | kfree(wb->rambuf_pool); 596 | return err; 597 | } 598 | 599 | static void free_rambuf_pool(struct wb_device *wb) 600 | { 601 | size_t i; 602 | for (i = 0; i < NR_RAMBUF_POOL; i++) 603 | vfree(wb->rambuf_pool[i].data); 604 | kfree(wb->rambuf_pool); 605 | } 606 | 607 | struct rambuffer *get_rambuffer_by_id(struct wb_device *wb, u64 id) 608 | { 609 | u32 tmp32; 610 | div_u64_rem(id - 1, NR_RAMBUF_POOL, &tmp32); 611 | return wb->rambuf_pool + tmp32; 612 | } 613 | 614 | /*----------------------------------------------------------------------------*/ 615 | 616 | /* 617 | * Initialize core devices 618 | * - Cache device (SSD) 619 | * - RAM buffers (DRAM) 620 | */ 621 | static int init_devices(struct wb_device *wb) 622 | { 623 | int err = 0; 624 | 625 | err = might_format_cache_device(wb); 626 | if (err) 627 | return err; 628 | 629 | err = init_rambuf_pool(wb); 630 | if (err) { 631 | DMERR("init_rambuf_pool failed"); 632 | return err; 633 | } 634 | 635 | return err; 636 | } 637 | 638 | static void free_devices(struct wb_device *wb) 639 | { 640 | free_rambuf_pool(wb); 641 | } 642 | 643 | /*----------------------------------------------------------------------------*/ 644 | 645 | static int read_superblock_record(struct superblock_record_device *record, 646 | struct wb_device *wb) 647 | { 648 | int err = 0; 649 | struct dm_io_request io_req; 650 | struct dm_io_region region; 651 | 652 | void *buf = mempool_alloc(wb->buf_8_pool, GFP_KERNEL); 653 | if (!buf) 654 | return -ENOMEM; 655 | 656 | check_buffer_alignment(buf); 657 | 658 | io_req = (struct dm_io_request) { 659 | WB_IO_READ, 660 | .client = wb->io_client, 661 | .notify.fn = NULL, 662 | .mem.type = DM_IO_KMEM, 663 | .mem.ptr.addr = buf, 664 | }; 665 | region = (struct dm_io_region) { 666 | .bdev = wb->cache_dev->bdev, 667 | .sector = (1 << 11) - 8, 668 | .count = 8, 669 | }; 670 | err = wb_io(&io_req, 1, ®ion, NULL, false); 671 | if (err) 672 | goto bad_io; 673 | 674 | memcpy(record, buf + (7 << 9), sizeof(*record)); 675 | 676 | bad_io: 677 | mempool_free(buf, wb->buf_8_pool); 678 | return err; 679 | } 680 | 681 | /* 682 | * Read out whole segment of @seg to a pre-allocated @buf 683 | */ 684 | static int read_whole_segment(void *buf, struct wb_device *wb, 685 | struct segment_header *seg) 686 | { 687 | struct dm_io_request io_req = { 688 | WB_IO_READ, 689 | 
.client = wb->io_client, 690 | .notify.fn = NULL, 691 | .mem.type = DM_IO_VMA, 692 | .mem.ptr.addr = buf, 693 | }; 694 | struct dm_io_region region = { 695 | .bdev = wb->cache_dev->bdev, 696 | .sector = seg->start_sector, 697 | .count = 1 << SEGMENT_SIZE_ORDER, 698 | }; 699 | return wb_io(&io_req, 1, &region, NULL, false); 700 | } 701 | 702 | /* 703 | * We compute the checksum of a segment from the valid data in the segment, 704 | * excluding the first sector. 705 | */ 706 | u32 calc_checksum(void *rambuffer, u8 length) 707 | { 708 | unsigned int len = (4096 - 512) + 4096 * length; 709 | return ~crc32c(0xffffffff, rambuffer + 512, len); 710 | } 711 | 712 | void prepare_segment_header_device(void *rambuffer, 713 | struct wb_device *wb, 714 | struct segment_header *src) 715 | { 716 | struct segment_header_device *dest = rambuffer; 717 | u32 i; 718 | 719 | ASSERT((src->length) == (wb->cursor - src->start_idx)); 720 | 721 | for (i = 0; i < src->length; i++) { 722 | struct metablock *mb = src->mb_array + i; 723 | struct metablock_device *mbdev = dest->mbarr + i; 724 | 725 | mbdev->sector = cpu_to_le64((u64)mb->sector); 726 | mbdev->dirty_bits = mb->dirtiness.is_dirty ? mb->dirtiness.data_bits : 0; 727 | } 728 | 729 | dest->id = cpu_to_le64(src->id); 730 | dest->length = src->length; 731 | dest->checksum = cpu_to_le32(calc_checksum(rambuffer, src->length)); 732 | } 733 | 734 | /*----------------------------------------------------------------------------*/ 735 | 736 | /* 737 | * Apply @i-th metablock in @src to @seg 738 | */ 739 | static int apply_metablock_device(struct wb_device *wb, struct segment_header *seg, 740 | struct segment_header_device *src, u8 i) 741 | { 742 | struct lookup_key key; 743 | struct ht_head *head; 744 | struct metablock *found = NULL, *mb = seg->mb_array + i; 745 | struct metablock_device *mbdev = src->mbarr + i; 746 | 747 | mb->sector = le64_to_cpu(mbdev->sector); 748 | 749 | mb->dirtiness.data_bits = mbdev->dirty_bits ? mbdev->dirty_bits : 255; 750 | mb->dirtiness.is_dirty = mbdev->dirty_bits ? 
true : false; 751 | 752 | key = (struct lookup_key) { 753 | .sector = mb->sector, 754 | }; 755 | head = ht_get_head(wb, &key); 756 | found = ht_lookup(wb, head, &key); 757 | if (found) { 758 | int err = 0; 759 | u8 i; 760 | struct write_io wio; 761 | void *buf = mempool_alloc(wb->buf_8_pool, GFP_KERNEL); 762 | if (!buf) 763 | return -ENOMEM; 764 | 765 | wio = (struct write_io) { 766 | .data = buf, 767 | .data_bits = 0, 768 | }; 769 | err = prepare_overwrite(wb, mb_to_seg(wb, found), found, &wio, mb->dirtiness.data_bits); 770 | if (err) 771 | goto fail_out; 772 | 773 | for (i = 0; i < 8; i++) { 774 | struct dm_io_request io_req; 775 | struct dm_io_region region; 776 | if (!(wio.data_bits & (1 << i))) 777 | continue; 778 | 779 | io_req = (struct dm_io_request) { 780 | WB_IO_WRITE, 781 | .client = wb->io_client, 782 | .notify.fn = NULL, 783 | .mem.type = DM_IO_KMEM, 784 | .mem.ptr.addr = wio.data + (i << 9), 785 | }; 786 | region = (struct dm_io_region) { 787 | .bdev = wb->backing_dev->bdev, 788 | .sector = mb->sector + i, 789 | .count = 1, 790 | }; 791 | err = wb_io(&io_req, 1, ®ion, NULL, true); 792 | if (err) 793 | break; 794 | } 795 | 796 | fail_out: 797 | mempool_free(buf, wb->buf_8_pool); 798 | if (err) 799 | return err; 800 | } 801 | 802 | ht_register(wb, head, mb, &key); 803 | 804 | if (mb->dirtiness.is_dirty) 805 | inc_nr_dirty_caches(wb); 806 | 807 | return 0; 808 | } 809 | 810 | static int apply_segment_header_device(struct wb_device *wb, struct segment_header *seg, 811 | struct segment_header_device *src) 812 | { 813 | int err = 0; 814 | u8 i; 815 | seg->length = src->length; 816 | for (i = 0; i < src->length; i++) { 817 | err = apply_metablock_device(wb, seg, src, i); 818 | if (err) 819 | break; 820 | } 821 | return err; 822 | } 823 | 824 | /* 825 | * Read out only segment header (4KB) of @seg to @buf 826 | */ 827 | static int read_segment_header(void *buf, struct wb_device *wb, 828 | struct segment_header *seg) 829 | { 830 | struct dm_io_request io_req = { 831 | WB_IO_READ, 832 | .client = wb->io_client, 833 | .notify.fn = NULL, 834 | .mem.type = DM_IO_KMEM, 835 | .mem.ptr.addr = buf, 836 | }; 837 | struct dm_io_region region = { 838 | .bdev = wb->cache_dev->bdev, 839 | .sector = seg->start_sector, 840 | .count = 8, 841 | }; 842 | return wb_io(&io_req, 1, ®ion, NULL, false); 843 | } 844 | 845 | /* 846 | * Find the max id from all the segment headers 847 | * @max_id (out) : The max id found 848 | */ 849 | static int do_find_max_id(struct wb_device *wb, u64 *max_id) 850 | { 851 | int err = 0; 852 | u32 k; 853 | 854 | void *buf = mempool_alloc(wb->buf_8_pool, GFP_KERNEL); 855 | if (!buf) 856 | return -ENOMEM; 857 | check_buffer_alignment(buf); 858 | 859 | *max_id = 0; 860 | for (k = 0; k < wb->nr_segments; k++) { 861 | struct segment_header *seg = segment_at(wb, k); 862 | struct segment_header_device *header; 863 | err = read_segment_header(buf, wb, seg); 864 | if (err) 865 | goto out; 866 | 867 | header = buf; 868 | if (le64_to_cpu(header->id) > *max_id) 869 | *max_id = le64_to_cpu(header->id); 870 | } 871 | out: 872 | mempool_free(buf, wb->buf_8_pool); 873 | return err; 874 | } 875 | 876 | static int find_max_id(struct wb_device *wb, u64 *max_id) 877 | { 878 | /* 879 | * Fast path. 880 | * If it's the first creation, we don't need to look over 881 | * the segment headers to know that the max_id is zero. 
882 | */ 883 | if (wb->do_format) { 884 | *max_id = 0; 885 | return 0; 886 | } 887 | 888 | return do_find_max_id(wb, max_id); 889 | } 890 | 891 | /* 892 | * Iterate over the logs on the cache device and apply (recover the cache metadata) 893 | * valid (checksum is correct) segments. 894 | * A segment is valid means that the segment was written without any failure 895 | * typically due to unexpected power failure. 896 | * 897 | * @max_id (in/out) 898 | * - in : The max id found in find_max_id() 899 | * - out : The last id applied in this function 900 | */ 901 | static int do_apply_valid_segments(struct wb_device *wb, u64 *max_id) 902 | { 903 | int err = 0; 904 | struct segment_header *seg; 905 | struct segment_header_device *header; 906 | u32 i, start_idx; 907 | 908 | void *rambuf = vmalloc(1 << (SEGMENT_SIZE_ORDER + 9)); 909 | if (!rambuf) 910 | return -ENOMEM; 911 | 912 | /* 913 | * We are starting from the segment next to the newest one, which can 914 | * be the oldest. The id can be zero if the logs didn't lap at all. 915 | */ 916 | start_idx = segment_id_to_idx(wb, *max_id + 1); 917 | *max_id = 0; 918 | 919 | for (i = start_idx; i < (start_idx + wb->nr_segments); i++) { 920 | u32 actual, expected, k; 921 | div_u64_rem(i, wb->nr_segments, &k); 922 | seg = segment_at(wb, k); 923 | 924 | err = read_whole_segment(rambuf, wb, seg); 925 | if (err) 926 | break; 927 | 928 | header = rambuf; 929 | 930 | /* 931 | * We can't break here. 932 | * Consider sequence of id [1,2,3,0,0,0] 933 | * The max_id is 3 and we start from the 4th segment. 934 | * If we break, the valid logs (1,2,3) are ignored. 935 | */ 936 | if (!le64_to_cpu(header->id)) 937 | continue; 938 | 939 | /* 940 | * Compare the checksum 941 | * if they don't match we discard the subsequent logs. 942 | */ 943 | actual = calc_checksum(rambuf, header->length); 944 | expected = le32_to_cpu(header->checksum); 945 | if (actual != expected) { 946 | DMWARN("Checksum incorrect id:%llu checksum: %u != %u", 947 | (long long unsigned int) le64_to_cpu(header->id), 948 | actual, expected); 949 | break; 950 | } 951 | 952 | /* This segment is correct and we apply */ 953 | err = apply_segment_header_device(wb, seg, header); 954 | if (err) 955 | break; 956 | 957 | *max_id = le64_to_cpu(header->id); 958 | } 959 | 960 | vfree(rambuf); 961 | return err; 962 | } 963 | 964 | static int apply_valid_segments(struct wb_device *wb, u64 *max_id) 965 | { 966 | /* 967 | * Fast path. 968 | * If the max_id is zero, there is obviously no valid segments. 969 | * For the fast initialization, we quit here immediately. 970 | */ 971 | if (!(*max_id)) 972 | return 0; 973 | 974 | return do_apply_valid_segments(wb, max_id); 975 | } 976 | 977 | static int infer_last_writeback_id(struct wb_device *wb) 978 | { 979 | int err = 0; 980 | 981 | u64 inferred_last_writeback_id; 982 | u64 record_id; 983 | 984 | struct superblock_record_device record; 985 | err = read_superblock_record(&record, wb); 986 | if (err) 987 | return err; 988 | 989 | inferred_last_writeback_id = 990 | SUB_ID(atomic64_read(&wb->last_flushed_segment_id), wb->nr_segments); 991 | 992 | /* 993 | * If last_writeback_id is recorded on the super block 994 | * we can eliminate unnecessary writeback for the segments that were 995 | * written back before. 
996 | */ 997 | record_id = le64_to_cpu(record.last_writeback_segment_id); 998 | if (record_id > inferred_last_writeback_id) { 999 | u64 id; 1000 | for (id = inferred_last_writeback_id + 1; id <= record_id; id++) 1001 | mark_clean_seg(wb, get_segment_header_by_id(wb, id)); 1002 | inferred_last_writeback_id = record_id; 1003 | } 1004 | 1005 | atomic64_set(&wb->last_writeback_segment_id, inferred_last_writeback_id); 1006 | return err; 1007 | } 1008 | 1009 | /* 1010 | * Replay all the logs on the cache device to reconstruct the in-memory metadata. 1011 | * 1012 | * Algorithm: 1013 | * 1. Find the maximum id 1014 | * 2. Start from the segment next to the newest one and iterate over all the logs 1015 | * 3. Skip a segment if its id is 0 or its checksum is incorrect 1016 | * 4. Apply it otherwise 1017 | * 1018 | * This algorithm is robust against a flaky SSD that may write a segment partially 1019 | * or lose data in its buffer on a power fault. 1020 | */ 1021 | static int replay_log_on_cache(struct wb_device *wb) 1022 | { 1023 | int err = 0; 1024 | 1025 | u64 max_id; 1026 | err = find_max_id(wb, &max_id); 1027 | if (err) { 1028 | DMERR("find_max_id failed"); 1029 | return err; 1030 | } 1031 | 1032 | err = apply_valid_segments(wb, &max_id); 1033 | if (err) { 1034 | DMERR("apply_valid_segments failed"); 1035 | return err; 1036 | } 1037 | 1038 | /* Setup last_flushed_segment_id */ 1039 | atomic64_set(&wb->last_flushed_segment_id, max_id); 1040 | 1041 | /* Setup last_queued_segment_id */ 1042 | atomic64_set(&wb->last_queued_segment_id, max_id); 1043 | 1044 | /* Setup last_writeback_segment_id */ 1045 | infer_last_writeback_id(wb); 1046 | 1047 | return err; 1048 | } 1049 | 1050 | /* 1051 | * Acquire and initialize the first segment header for our caching. 1052 | */ 1053 | static void prepare_first_seg(struct wb_device *wb) 1054 | { 1055 | u64 init_segment_id = atomic64_read(&wb->last_flushed_segment_id) + 1; 1056 | acquire_new_seg(wb, init_segment_id); 1057 | cursor_init(wb); 1058 | } 1059 | 1060 | /* 1061 | * Recover all the cache state from the persistent devices 1062 | */ 1063 | static int recover_cache(struct wb_device *wb) 1064 | { 1065 | int err = 0; 1066 | 1067 | err = replay_log_on_cache(wb); 1068 | if (err) { 1069 | DMERR("replay_log_on_cache failed"); 1070 | return err; 1071 | } 1072 | 1073 | prepare_first_seg(wb); 1074 | return 0; 1075 | } 1076 | 1077 | /*----------------------------------------------------------------------------*/ 1078 | 1079 | static struct writeback_segment *alloc_writeback_segment(struct wb_device *wb, gfp_t gfp) 1080 | { 1081 | u8 i; 1082 | 1083 | struct writeback_segment *writeback_seg = kmalloc(sizeof(*writeback_seg), gfp); 1084 | if (!writeback_seg) 1085 | goto bad_writeback_seg; 1086 | 1087 | writeback_seg->ios = kmalloc(wb->nr_caches_inseg * sizeof(struct writeback_io), gfp); 1088 | if (!writeback_seg->ios) 1089 | goto bad_ios; 1090 | 1091 | writeback_seg->buf = vmalloc((1 << (SEGMENT_SIZE_ORDER + 9)) - (1 << 12)); 1092 | if (!writeback_seg->buf) 1093 | goto bad_buf; 1094 | 1095 | for (i = 0; i < wb->nr_caches_inseg; i++) { 1096 | struct writeback_io *writeback_io = writeback_seg->ios + i; 1097 | writeback_io->data = writeback_seg->buf + (i << 12); 1098 | } 1099 | 1100 | return writeback_seg; 1101 | 1102 | bad_buf: 1103 | kfree(writeback_seg->ios); 1104 | bad_ios: 1105 | kfree(writeback_seg); 1106 | bad_writeback_seg: 1107 | return NULL; 1108 | } 1109 | 1110 | static void free_writeback_segment(struct wb_device *wb, struct writeback_segment *writeback_seg) 1111 | { 1112 | vfree(writeback_seg->buf); 1113 | 
kfree(writeback_seg->ios); 1114 | kfree(writeback_seg); 1115 | } 1116 | 1117 | /* 1118 | * Try to allocate new writeback buffer by the @nr_batch size. 1119 | * On success, it frees the old buffer. 1120 | * 1121 | * Bad user may set # of batches that can hardly allocate. 1122 | * This function is even robust in such case. 1123 | */ 1124 | static void free_writeback_ios(struct wb_device *wb) 1125 | { 1126 | size_t i; 1127 | for (i = 0; i < wb->nr_cur_batched_writeback; i++) 1128 | free_writeback_segment(wb, *(wb->writeback_segs + i)); 1129 | kfree(wb->writeback_segs); 1130 | } 1131 | 1132 | /* 1133 | * Request to allocate data structures to write back @nr_batch segments. 1134 | * Previous structures are preserved in case of failure. 1135 | */ 1136 | int try_alloc_writeback_ios(struct wb_device *wb, size_t nr_batch, gfp_t gfp) 1137 | { 1138 | int err = 0; 1139 | size_t i; 1140 | 1141 | struct writeback_segment **writeback_segs = kzalloc( 1142 | nr_batch * sizeof(struct writeback_segment *), gfp); 1143 | if (!writeback_segs) 1144 | return -ENOMEM; 1145 | 1146 | for (i = 0; i < nr_batch; i++) { 1147 | struct writeback_segment *alloced = alloc_writeback_segment(wb, gfp); 1148 | if (!alloced) { 1149 | size_t j; 1150 | for (j = 0; j < i; j++) 1151 | free_writeback_segment(wb, writeback_segs[j]); 1152 | kfree(writeback_segs); 1153 | 1154 | DMERR("Failed to allocate writeback_segs"); 1155 | return -ENOMEM; 1156 | } 1157 | writeback_segs[i] = alloced; 1158 | } 1159 | 1160 | /* 1161 | * Free old buffers if exists. 1162 | * wb->writeback_segs is firstly NULL under constructor .ctr. 1163 | */ 1164 | if (wb->writeback_segs) 1165 | free_writeback_ios(wb); 1166 | 1167 | /* And then swap by new values */ 1168 | wb->writeback_segs = writeback_segs; 1169 | wb->nr_writeback_segs = nr_batch; 1170 | 1171 | return err; 1172 | } 1173 | 1174 | /*----------------------------------------------------------------------------*/ 1175 | 1176 | #define CREATE_DAEMON(name) \ 1177 | do { \ 1178 | wb->name = kthread_create( \ 1179 | name##_proc, wb, "dmwb_" #name); \ 1180 | if (IS_ERR(wb->name)) { \ 1181 | err = PTR_ERR(wb->name); \ 1182 | wb->name = NULL; \ 1183 | DMERR("couldn't spawn " #name); \ 1184 | goto bad_##name; \ 1185 | } \ 1186 | wake_up_process(wb->name); \ 1187 | } while (0) 1188 | 1189 | /* 1190 | * Alloc and then setup the initial state of the metadata 1191 | * 1192 | * Metadata: 1193 | * - Segment header array 1194 | * - Metablocks 1195 | * - Hash table 1196 | */ 1197 | static int init_metadata(struct wb_device *wb) 1198 | { 1199 | int err = 0; 1200 | 1201 | err = init_segment_header_array(wb); 1202 | if (err) { 1203 | DMERR("init_segment_header_array failed"); 1204 | goto bad_alloc_segment_header_array; 1205 | } 1206 | 1207 | err = ht_empty_init(wb); 1208 | if (err) { 1209 | DMERR("ht_empty_init failed"); 1210 | goto bad_alloc_ht; 1211 | } 1212 | 1213 | return err; 1214 | 1215 | bad_alloc_ht: 1216 | free_segment_header_array(wb); 1217 | bad_alloc_segment_header_array: 1218 | return err; 1219 | } 1220 | 1221 | static void free_metadata(struct wb_device *wb) 1222 | { 1223 | free_ht(wb); 1224 | free_segment_header_array(wb); 1225 | } 1226 | 1227 | static int init_writeback_daemon(struct wb_device *wb) 1228 | { 1229 | int err = 0; 1230 | size_t nr_batch; 1231 | 1232 | atomic_set(&wb->writeback_fail_count, 0); 1233 | atomic_set(&wb->writeback_io_count, 0); 1234 | 1235 | nr_batch = 32; 1236 | wb->nr_max_batched_writeback = nr_batch; 1237 | if (try_alloc_writeback_ios(wb, nr_batch, GFP_KERNEL)) 1238 | return 
-ENOMEM; 1239 | 1240 | init_waitqueue_head(&wb->writeback_wait_queue); 1241 | init_waitqueue_head(&wb->wait_drop_caches); 1242 | init_waitqueue_head(&wb->writeback_io_wait_queue); 1243 | 1244 | wb->allow_writeback = false; 1245 | wb->urge_writeback = false; 1246 | wb->force_drop = false; 1247 | CREATE_DAEMON(writeback_daemon); 1248 | 1249 | return err; 1250 | 1251 | bad_writeback_daemon: 1252 | free_writeback_ios(wb); 1253 | return err; 1254 | } 1255 | 1256 | static int init_flush_daemon(struct wb_device *wb) 1257 | { 1258 | int err = 0; 1259 | init_waitqueue_head(&wb->flush_wait_queue); 1260 | CREATE_DAEMON(flush_daemon); 1261 | return err; 1262 | 1263 | bad_flush_daemon: 1264 | return err; 1265 | } 1266 | 1267 | static int init_flush_barrier_work(struct wb_device *wb) 1268 | { 1269 | wb->barrier_wq = create_singlethread_workqueue("dmwb_barrier"); 1270 | if (!wb->barrier_wq) { 1271 | DMERR("Failed to allocate barrier_wq"); 1272 | return -ENOMEM; 1273 | } 1274 | bio_list_init(&wb->barrier_ios); 1275 | INIT_WORK(&wb->flush_barrier_work, flush_barrier_ios); 1276 | return 0; 1277 | } 1278 | 1279 | static int init_writeback_modulator(struct wb_device *wb) 1280 | { 1281 | int err = 0; 1282 | wb->writeback_threshold = 0; 1283 | CREATE_DAEMON(writeback_modulator); 1284 | return err; 1285 | 1286 | bad_writeback_modulator: 1287 | return err; 1288 | } 1289 | 1290 | static int init_sb_record_updater(struct wb_device *wb) 1291 | { 1292 | int err = 0; 1293 | wb->update_sb_record_interval = 0; 1294 | CREATE_DAEMON(sb_record_updater); 1295 | return err; 1296 | 1297 | bad_sb_record_updater: 1298 | return err; 1299 | } 1300 | 1301 | static int init_data_synchronizer(struct wb_device *wb) 1302 | { 1303 | int err = 0; 1304 | wb->sync_data_interval = 0; 1305 | CREATE_DAEMON(data_synchronizer); 1306 | return err; 1307 | 1308 | bad_data_synchronizer: 1309 | return err; 1310 | } 1311 | 1312 | int resume_cache(struct wb_device *wb) 1313 | { 1314 | int err = 0; 1315 | 1316 | wb->nr_segments = calc_nr_segments(wb->cache_dev, wb); 1317 | wb->nr_caches_inseg = (1 << (SEGMENT_SIZE_ORDER - 3)) - 1; 1318 | wb->nr_caches = wb->nr_segments * wb->nr_caches_inseg; 1319 | 1320 | err = init_devices(wb); 1321 | if (err) 1322 | goto bad_devices; 1323 | 1324 | err = init_metadata(wb); 1325 | if (err) 1326 | goto bad_metadata; 1327 | 1328 | err = init_writeback_daemon(wb); 1329 | if (err) { 1330 | DMERR("init_writeback_daemon failed"); 1331 | goto bad_writeback_daemon; 1332 | } 1333 | 1334 | err = recover_cache(wb); 1335 | if (err) { 1336 | DMERR("recover_cache failed"); 1337 | goto bad_recover; 1338 | } 1339 | 1340 | err = init_flush_daemon(wb); 1341 | if (err) { 1342 | DMERR("init_flush_daemon failed"); 1343 | goto bad_flush_daemon; 1344 | } 1345 | 1346 | err = init_flush_barrier_work(wb); 1347 | if (err) { 1348 | DMERR("init_flush_barrier_work failed"); 1349 | goto bad_flush_barrier_work; 1350 | } 1351 | 1352 | err = init_writeback_modulator(wb); 1353 | if (err) { 1354 | DMERR("init_writeback_modulator failed"); 1355 | goto bad_modulator; 1356 | } 1357 | 1358 | err = init_sb_record_updater(wb); 1359 | if (err) { 1360 | DMERR("init_sb_recorder failed"); 1361 | goto bad_updater; 1362 | } 1363 | 1364 | err = init_data_synchronizer(wb); 1365 | if (err) { 1366 | DMERR("init_data_synchronizer failed"); 1367 | goto bad_synchronizer; 1368 | } 1369 | 1370 | return err; 1371 | 1372 | bad_synchronizer: 1373 | kthread_stop(wb->sb_record_updater); 1374 | bad_updater: 1375 | kthread_stop(wb->writeback_modulator); 1376 | bad_modulator: 1377 
| destroy_workqueue(wb->barrier_wq); 1378 | bad_flush_barrier_work: 1379 | kthread_stop(wb->flush_daemon); 1380 | bad_flush_daemon: 1381 | bad_recover: 1382 | kthread_stop(wb->writeback_daemon); 1383 | free_writeback_ios(wb); 1384 | bad_writeback_daemon: 1385 | free_metadata(wb); 1386 | bad_metadata: 1387 | free_devices(wb); 1388 | bad_devices: 1389 | return err; 1390 | } 1391 | 1392 | void free_cache(struct wb_device *wb) 1393 | { 1394 | /* 1395 | * kthread_stop() wakes up the thread. 1396 | * So we don't need to wake them up by ourselves. 1397 | */ 1398 | kthread_stop(wb->data_synchronizer); 1399 | kthread_stop(wb->sb_record_updater); 1400 | kthread_stop(wb->writeback_modulator); 1401 | 1402 | destroy_workqueue(wb->barrier_wq); 1403 | 1404 | kthread_stop(wb->flush_daemon); 1405 | 1406 | kthread_stop(wb->writeback_daemon); 1407 | free_writeback_ios(wb); 1408 | 1409 | free_metadata(wb); 1410 | 1411 | free_devices(wb); 1412 | } 1413 | -------------------------------------------------------------------------------- /src/dm-writeboost-target.c: -------------------------------------------------------------------------------- 1 | /* 2 | * dm-writeboost 3 | * Log-structured Caching for Linux 4 | * 5 | * This file is part of dm-writeboost 6 | * Copyright (C) 2012-2025 Akira Hayakawa 7 | * 8 | * This program is free software; you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation; either version 2 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License along 19 | * with this program; if not, write to the Free Software Foundation, Inc., 20 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 21 | */ 22 | 23 | #include "dm-writeboost.h" 24 | #include "dm-writeboost-metadata.h" 25 | #include "dm-writeboost-daemon.h" 26 | 27 | #include "linux/sort.h" 28 | 29 | #if (LINUX_VERSION_CODE >= KERNEL_VERSION(6,8,2)) || \ 30 | ((LINUX_VERSION_CODE >= KERNEL_VERSION(6,7,11)) && (LINUX_VERSION_CODE < KERNEL_VERSION(6,8,0))) || \ 31 | ((LINUX_VERSION_CODE >= KERNEL_VERSION(6,6,23)) && (LINUX_VERSION_CODE < KERNEL_VERSION(6,7,0))) || \ 32 | ((LINUX_VERSION_CODE >= KERNEL_VERSION(6,1,83)) && (LINUX_VERSION_CODE < KERNEL_VERSION(6,2,0))) 33 | // Linux commit 6e5f0f6383b4896c7e9b943d84b136149d0f45e9 "dm io: Support IO priority" 34 | // added the IO priority parameter in v6.9-rc1. 35 | #define DM_IO(arg1, arg2, arg3, arg4) dm_io(arg1, arg2, arg3, arg4, IOPRIO_DEFAULT) 36 | #else 37 | #define DM_IO(arg1, arg2, arg3, arg4) dm_io(arg1, arg2, arg3, arg4) 38 | #endif 39 | 40 | /*----------------------------------------------------------------------------*/ 41 | 42 | void do_check_buffer_alignment(void *buf, const char *name, const char *caller) 43 | { 44 | unsigned long addr = (unsigned long) buf; 45 | 46 | if (!IS_ALIGNED(addr, 1 << 9)) { 47 | DMCRIT("@%s in %s is not sector-aligned. 
I/O buffer must be sector-aligned.", name, caller); 48 | BUG(); 49 | } 50 | } 51 | 52 | /*----------------------------------------------------------------------------*/ 53 | 54 | struct wb_io { 55 | struct work_struct work; 56 | int err; 57 | unsigned long err_bits; 58 | struct dm_io_request *io_req; 59 | unsigned num_regions; 60 | struct dm_io_region *regions; 61 | }; 62 | 63 | static void wb_io_fn(struct work_struct *work) 64 | { 65 | struct wb_io *io = container_of(work, struct wb_io, work); 66 | io->err_bits = 0; 67 | io->err = DM_IO(io->io_req, io->num_regions, io->regions, &io->err_bits); 68 | } 69 | 70 | int wb_io_internal(struct wb_device *wb, struct dm_io_request *io_req, 71 | unsigned num_regions, struct dm_io_region *regions, 72 | unsigned long *err_bits, bool thread, const char *caller) 73 | { 74 | int err = 0; 75 | 76 | if (thread) { 77 | struct wb_io io = { 78 | .io_req = io_req, 79 | .regions = regions, 80 | .num_regions = num_regions, 81 | }; 82 | ASSERT(io_req->notify.fn == NULL); 83 | 84 | INIT_WORK_ONSTACK(&io.work, wb_io_fn); 85 | queue_work(wb->io_wq, &io.work); 86 | flush_workqueue(wb->io_wq); 87 | destroy_work_on_stack(&io.work); /* Pair with INIT_WORK_ONSTACK */ 88 | 89 | err = io.err; 90 | if (err_bits) 91 | *err_bits = io.err_bits; 92 | } else { 93 | err = DM_IO(io_req, num_regions, regions, err_bits); 94 | } 95 | 96 | /* err_bits can be NULL. */ 97 | if (err || (err_bits && *err_bits)) { 98 | char buf[BDEVNAME_SIZE]; 99 | dev_t dev = regions->bdev->bd_dev; 100 | 101 | unsigned long eb; 102 | if (!err_bits) 103 | eb = (~(unsigned long)0); 104 | else 105 | eb = *err_bits; 106 | 107 | format_dev_t(buf, dev); 108 | DMERR("%s() I/O error(%d), bits(%lu), dev(%s), sector(%llu), %s", 109 | caller, err, eb, 110 | buf, (unsigned long long) regions->sector, 111 | req_is_write(io_req) ? 
"write" : "read"); 112 | } 113 | 114 | return err; 115 | } 116 | 117 | sector_t dm_devsize(struct dm_dev *dev) 118 | { 119 | #if LINUX_VERSION_CODE >= KERNEL_VERSION(5,11,0) 120 | return bdev_nr_sectors(dev->bdev); 121 | #else 122 | return i_size_read(dev->bdev->bd_inode) >> 9; 123 | #endif 124 | } 125 | 126 | /*----------------------------------------------------------------------------*/ 127 | 128 | void bio_io_success_compat(struct bio *bio) 129 | { 130 | #if LINUX_VERSION_CODE >= KERNEL_VERSION(4,13,0) 131 | bio->bi_status = BLK_STS_OK; 132 | bio_endio(bio); 133 | #elif LINUX_VERSION_CODE >= KERNEL_VERSION(4,3,0) 134 | bio->bi_error = 0; 135 | bio_endio(bio); 136 | #else 137 | bio_endio(bio, 0); 138 | #endif 139 | } 140 | 141 | #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,14,0) 142 | #define bi_sector(bio) (bio)->bi_iter.bi_sector 143 | #else 144 | #define bi_sector(bio) (bio)->bi_sector 145 | #endif 146 | 147 | static void bio_remap(struct bio *bio, struct dm_dev *dev, sector_t sector) 148 | { 149 | #if LINUX_VERSION_CODE >= KERNEL_VERSION(4,14,0) 150 | bio_set_dev(bio, dev->bdev); 151 | #else 152 | bio->bi_bdev = dev->bdev; 153 | #endif 154 | bi_sector(bio) = sector; 155 | } 156 | 157 | static u8 calc_offset(sector_t sector) 158 | { 159 | u32 tmp32; 160 | div_u64_rem(sector, 1 << 3, &tmp32); 161 | return tmp32; 162 | } 163 | 164 | static u8 bio_calc_offset(struct bio *bio) 165 | { 166 | return calc_offset(bi_sector(bio)); 167 | } 168 | 169 | static bool bio_is_fullsize(struct bio *bio) 170 | { 171 | return bio_sectors(bio) == (1 << 3); 172 | } 173 | 174 | static bool bio_is_write(struct bio *bio) 175 | { 176 | return bio_data_dir(bio) == WRITE; 177 | } 178 | 179 | /* 180 | * We use 4KB alignment address of original request the as the lookup key. 181 | */ 182 | static sector_t calc_cache_alignment(sector_t bio_sector) 183 | { 184 | return div_u64(bio_sector, 1 << 3) * (1 << 3); 185 | } 186 | 187 | /*----------------------------------------------------------------------------*/ 188 | 189 | /* 190 | * Wake up the processes on the wq if the wq is active. 191 | * (At least a process is waiting on it) 192 | * This function should only used for wq that is rarely active. 193 | * Otherwise ordinary wake_up() should be used instead. 
194 | */ 195 | static void wake_up_active_wq(wait_queue_head_t *wq) 196 | { 197 | if (unlikely(waitqueue_active(wq))) 198 | wake_up(wq); 199 | } 200 | 201 | /*----------------------------------------------------------------------------*/ 202 | 203 | static u8 count_dirty_caches_remained(struct segment_header *seg) 204 | { 205 | u8 i, count = 0; 206 | struct metablock *mb; 207 | for (i = 0; i < seg->length; i++) { 208 | mb = seg->mb_array + i; 209 | if (mb->dirtiness.is_dirty) 210 | count++; 211 | } 212 | return count; 213 | } 214 | 215 | void inc_nr_dirty_caches(struct wb_device *wb) 216 | { 217 | ASSERT(wb); 218 | atomic64_inc(&wb->nr_dirty_caches); 219 | } 220 | 221 | void dec_nr_dirty_caches(struct wb_device *wb) 222 | { 223 | ASSERT(wb); 224 | if (atomic64_dec_and_test(&wb->nr_dirty_caches)) 225 | wake_up_interruptible(&wb->wait_drop_caches); 226 | } 227 | 228 | static bool taint_mb(struct wb_device *wb, struct metablock *mb, u8 data_bits) 229 | { 230 | unsigned long flags; 231 | bool flipped = false; 232 | 233 | ASSERT(data_bits > 0); 234 | spin_lock_irqsave(&wb->mb_lock, flags); 235 | if (!mb->dirtiness.is_dirty) { 236 | mb->dirtiness.is_dirty = true; 237 | flipped = true; 238 | } 239 | mb->dirtiness.data_bits |= data_bits; 240 | spin_unlock_irqrestore(&wb->mb_lock, flags); 241 | 242 | return flipped; 243 | } 244 | 245 | bool mark_clean_mb(struct wb_device *wb, struct metablock *mb) 246 | { 247 | unsigned long flags; 248 | bool flipped = false; 249 | 250 | spin_lock_irqsave(&wb->mb_lock, flags); 251 | if (mb->dirtiness.is_dirty) { 252 | mb->dirtiness.is_dirty = false; 253 | flipped = true; 254 | } 255 | spin_unlock_irqrestore(&wb->mb_lock, flags); 256 | 257 | return flipped; 258 | } 259 | 260 | /* 261 | * Read the dirtiness of a metablock at the moment. 262 | */ 263 | struct dirtiness read_mb_dirtiness(struct wb_device *wb, struct segment_header *seg, 264 | struct metablock *mb) 265 | { 266 | unsigned long flags; 267 | struct dirtiness retval; 268 | 269 | spin_lock_irqsave(&wb->mb_lock, flags); 270 | retval = mb->dirtiness; 271 | spin_unlock_irqrestore(&wb->mb_lock, flags); 272 | 273 | return retval; 274 | } 275 | 276 | /*----------------------------------------------------------------------------*/ 277 | 278 | void cursor_init(struct wb_device *wb) 279 | { 280 | wb->cursor = wb->current_seg->start_idx; 281 | wb->current_seg->length = 0; 282 | } 283 | 284 | /* 285 | * Advance the cursor and return the old cursor. 286 | * After returned, nr_inflight_ios is incremented to wait for this write to complete. 
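 *
 * Illustrative example (the numbers are assumed, only for the sketch):
 * with wb->nr_caches = 4 and wb->cursor = 3, successive calls return
 * 3, 0, 1, ... because the cursor wraps to 0 once it reaches wb->nr_caches.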
287 | */ 288 | static u32 advance_cursor(struct wb_device *wb) 289 | { 290 | u32 old; 291 | if (wb->cursor == wb->nr_caches) 292 | wb->cursor = 0; 293 | old = wb->cursor; 294 | wb->cursor++; 295 | wb->current_seg->length++; 296 | BUG_ON(wb->current_seg->length > wb->nr_caches_inseg); 297 | atomic_inc(&wb->current_seg->nr_inflight_ios); 298 | return old; 299 | } 300 | 301 | static bool needs_queue_seg(struct wb_device *wb) 302 | { 303 | bool rambuf_no_space = !mb_idx_inseg(wb, wb->cursor); 304 | return rambuf_no_space; 305 | } 306 | 307 | /*----------------------------------------------------------------------------*/ 308 | 309 | static void copy_barrier_requests(struct rambuffer *rambuf, struct wb_device *wb) 310 | { 311 | bio_list_init(&rambuf->barrier_ios); 312 | bio_list_merge(&rambuf->barrier_ios, &wb->barrier_ios); 313 | bio_list_init(&wb->barrier_ios); 314 | } 315 | 316 | static void prepare_rambuffer(struct rambuffer *rambuf, 317 | struct wb_device *wb, 318 | struct segment_header *seg) 319 | { 320 | rambuf->seg = seg; 321 | prepare_segment_header_device(rambuf->data, wb, seg); 322 | copy_barrier_requests(rambuf, wb); 323 | } 324 | 325 | static void init_rambuffer(struct wb_device *wb) 326 | { 327 | memset(wb->current_rambuf->data, 0, 1 << 12); 328 | } 329 | 330 | /* 331 | * Acquire a new RAM buffer for the new segment. 332 | */ 333 | static void __acquire_new_rambuffer(struct wb_device *wb, u64 id) 334 | { 335 | wait_for_flushing(wb, SUB_ID(id, NR_RAMBUF_POOL)); 336 | 337 | wb->current_rambuf = get_rambuffer_by_id(wb, id); 338 | 339 | init_rambuffer(wb); 340 | } 341 | 342 | static void __acquire_new_seg(struct wb_device *wb, u64 id) 343 | { 344 | struct segment_header *new_seg = get_segment_header_by_id(wb, id); 345 | 346 | /* 347 | * We wait for all requests to the new segment is consumed. 348 | * Mutex taken guarantees that no new I/O to this segment is coming in. 349 | */ 350 | wait_event(wb->inflight_ios_wq, 351 | !atomic_read(&new_seg->nr_inflight_ios)); 352 | 353 | wait_for_writeback(wb, SUB_ID(id, wb->nr_segments)); 354 | if (count_dirty_caches_remained(new_seg)) { 355 | DMERR("%u dirty caches remained. id:%llu", 356 | count_dirty_caches_remained(new_seg), id); 357 | BUG(); 358 | } 359 | discard_caches_inseg(wb, new_seg); 360 | 361 | /* 362 | * We mustn't set new id to the new segment before 363 | * all wait_* events are done since they uses those id for waiting. 364 | */ 365 | new_seg->id = id; 366 | wb->current_seg = new_seg; 367 | } 368 | 369 | /* 370 | * Acquire the new segment and RAM buffer for the following writes. 371 | * Guarantees all dirty caches in the segments are written back and 372 | * all metablocks in it are invalidated (Linked to null head). 
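 *
 * The two helpers called here wait on different ids:
 * __acquire_new_rambuffer() waits until the segment that last used this
 * RAM buffer slot (SUB_ID(id, NR_RAMBUF_POOL)) has been flushed, and
 * __acquire_new_seg() waits for the in-flight I/Os and the writeback of
 * the segment header being recycled (SUB_ID(id, wb->nr_segments)).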
373 | */ 374 | void acquire_new_seg(struct wb_device *wb, u64 id) 375 | { 376 | __acquire_new_rambuffer(wb, id); 377 | __acquire_new_seg(wb, id); 378 | } 379 | 380 | static void prepare_new_seg(struct wb_device *wb) 381 | { 382 | u64 next_id = wb->current_seg->id + 1; 383 | acquire_new_seg(wb, next_id); 384 | cursor_init(wb); 385 | } 386 | 387 | /*----------------------------------------------------------------------------*/ 388 | 389 | static void queue_flush_job(struct wb_device *wb) 390 | { 391 | wait_event(wb->inflight_ios_wq, !atomic_read(&wb->current_seg->nr_inflight_ios)); 392 | 393 | prepare_rambuffer(wb->current_rambuf, wb, wb->current_seg); 394 | 395 | smp_wmb(); 396 | atomic64_inc(&wb->last_queued_segment_id); 397 | wake_up_process(wb->flush_daemon); 398 | } 399 | 400 | static void queue_current_buffer(struct wb_device *wb) 401 | { 402 | queue_flush_job(wb); 403 | prepare_new_seg(wb); 404 | } 405 | 406 | /* 407 | * queue_current_buffer if the RAM buffer can't make space any more. 408 | */ 409 | static void might_queue_current_buffer(struct wb_device *wb) 410 | { 411 | if (needs_queue_seg(wb)) { 412 | update_nr_empty_segs(wb); 413 | queue_current_buffer(wb); 414 | } 415 | } 416 | 417 | /* 418 | * Flush out all the transient data at a moment but _NOT_ persistently. 419 | */ 420 | void flush_current_buffer(struct wb_device *wb) 421 | { 422 | struct segment_header *old_seg; 423 | 424 | mutex_lock(&wb->io_lock); 425 | old_seg = wb->current_seg; 426 | 427 | queue_current_buffer(wb); 428 | mutex_unlock(&wb->io_lock); 429 | 430 | wait_for_flushing(wb, old_seg->id); 431 | } 432 | 433 | /*----------------------------------------------------------------------------*/ 434 | 435 | static void inc_stat(struct wb_device *wb, 436 | int rw, bool found, bool on_buffer, bool fullsize) 437 | { 438 | atomic64_t *v; 439 | 440 | int i = 0; 441 | if (rw) 442 | i |= (1 << WB_STAT_WRITE); 443 | if (found) 444 | i |= (1 << WB_STAT_HIT); 445 | if (on_buffer) 446 | i |= (1 << WB_STAT_ON_BUFFER); 447 | if (fullsize) 448 | i |= (1 << WB_STAT_FULLSIZE); 449 | 450 | v = &wb->stat[i]; 451 | atomic64_inc(v); 452 | } 453 | 454 | static void clear_stat(struct wb_device *wb) 455 | { 456 | size_t i; 457 | for (i = 0; i < STATLEN; i++) { 458 | atomic64_t *v = &wb->stat[i]; 459 | atomic64_set(v, 0); 460 | } 461 | atomic64_set(&wb->count_non_full_flushed, 0); 462 | } 463 | 464 | /*----------------------------------------------------------------------------*/ 465 | 466 | #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,14,0) 467 | #define bv_vec struct bio_vec 468 | #define bv_page(vec) vec.bv_page 469 | #define bv_offset(vec) vec.bv_offset 470 | #define bv_len(vec) vec.bv_len 471 | #define bv_it struct bvec_iter 472 | #else 473 | #define bv_vec struct bio_vec * 474 | #define bv_page(vec) vec->bv_page 475 | #define bv_offset(vec) vec->bv_offset 476 | #define bv_len(vec) vec->bv_len 477 | #define bv_it int 478 | #endif 479 | 480 | /* 481 | * Incoming bio may have multiple bio vecs as a result bvec merging. 482 | * We shouldn't use bio_data directly to access to whole payload but 483 | * should iterate over the vector. 
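 *
 * For example, a 4KB write may arrive as two 2KB vecs backed by different
 * pages; the loop below therefore kmaps each page in turn and asserts that
 * the total copied length equals bio_sectors(bio) << 9.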
484 | */ 485 | static void copy_bio_payload(void *buf, struct bio *bio) 486 | { 487 | size_t sum = 0; 488 | bv_vec vec; 489 | bv_it it; 490 | bio_for_each_segment(vec, bio, it) { 491 | void *dst = kmap_atomic(bv_page(vec)); 492 | size_t l = bv_len(vec); 493 | memcpy(buf, dst + bv_offset(vec), l); 494 | kunmap_atomic(dst); 495 | buf += l; 496 | sum += l; 497 | } 498 | ASSERT(sum == (bio_sectors(bio) << 9)); 499 | } 500 | 501 | /* 502 | * Copy 512B buffer data to bio payload's i-th 512B area. 503 | */ 504 | static void __copy_to_bio_payload(struct bio *bio, void *buf, u8 i) 505 | { 506 | size_t head = 0; 507 | size_t tail = head; 508 | 509 | bv_vec vec; 510 | bv_it it; 511 | bio_for_each_segment(vec, bio, it) { 512 | size_t l = bv_len(vec); 513 | tail += l; 514 | if ((i << 9) < tail) { 515 | void *dst = kmap_atomic(bv_page(vec)); 516 | size_t offset = (i << 9) - head; 517 | BUG_ON((l - offset) < (1 << 9)); 518 | memcpy(dst + bv_offset(vec) + offset, buf, 1 << 9); 519 | kunmap_atomic(dst); 520 | return; 521 | } 522 | head += l; 523 | } 524 | BUG(); 525 | } 526 | 527 | /* 528 | * Copy 4KB buffer to bio payload with care to bio offset and copy bits. 529 | */ 530 | static void copy_to_bio_payload(struct bio *bio, void *buf, u8 copy_bits) 531 | { 532 | u8 offset = bio_calc_offset(bio); 533 | u8 i; 534 | for (i = 0; i < bio_sectors(bio); i++) { 535 | u8 i_offset = i + offset; 536 | if (copy_bits & (1 << i_offset)) 537 | __copy_to_bio_payload(bio, buf + (i_offset << 9), i); 538 | } 539 | } 540 | 541 | /*----------------------------------------------------------------------------*/ 542 | 543 | struct lookup_result { 544 | struct ht_head *head; /* Lookup head used */ 545 | struct lookup_key key; /* Lookup key used */ 546 | 547 | struct segment_header *found_seg; 548 | struct metablock *found_mb; 549 | 550 | bool found; /* Cache hit? */ 551 | bool on_buffer; /* Is the metablock found on the RAM buffer? */ 552 | }; 553 | 554 | /* 555 | * Lookup a bio relevant cache data. 556 | * In case of cache hit, nr_inflight_ios is incremented. 
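 *
 * The caller is expected to drop that reference with dec_inflight_ios()
 * on res->found_seg once it is done with the hit, as the read and write
 * paths below do. A minimal sketch of the pairing:
 *
 *   waiter: wait_event(wb->inflight_ios_wq, !atomic_read(&seg->nr_inflight_ios));
 *   waker:  if (atomic_dec_and_test(&seg->nr_inflight_ios))
 *                   wake_up_active_wq(&wb->inflight_ios_wq);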
557 | */ 558 | static void cache_lookup(struct wb_device *wb, struct bio *bio, struct lookup_result *res) 559 | { 560 | res->key = (struct lookup_key) { 561 | .sector = calc_cache_alignment(bi_sector(bio)), 562 | }; 563 | res->head = ht_get_head(wb, &res->key); 564 | 565 | res->found_mb = ht_lookup(wb, res->head, &res->key); 566 | if (res->found_mb) { 567 | res->found_seg = mb_to_seg(wb, res->found_mb); 568 | atomic_inc(&res->found_seg->nr_inflight_ios); 569 | } 570 | 571 | res->found = (res->found_mb != NULL); 572 | 573 | res->on_buffer = false; 574 | if (res->found) 575 | res->on_buffer = is_on_buffer(wb, res->found_mb->idx); 576 | 577 | inc_stat(wb, bio_is_write(bio), res->found, res->on_buffer, bio_is_fullsize(bio)); 578 | } 579 | 580 | static void dec_inflight_ios(struct wb_device *wb, struct segment_header *seg) 581 | { 582 | if (atomic_dec_and_test(&seg->nr_inflight_ios)) 583 | wake_up_active_wq(&wb->inflight_ios_wq); 584 | } 585 | 586 | /*----------------------------------------------------------------------------*/ 587 | 588 | static u8 to_mask(u8 offset, u8 count) 589 | { 590 | u8 i; 591 | u8 result = 0; 592 | if (count == 8) { 593 | result = 255; 594 | } else { 595 | for (i = 0; i < count; i++) 596 | result |= (1 << (i + offset)); 597 | } 598 | return result; 599 | } 600 | 601 | static int fill_payload_by_backing(struct wb_device *wb, struct bio *bio) 602 | { 603 | struct dm_io_request io_req; 604 | struct dm_io_region region; 605 | 606 | sector_t start = bi_sector(bio); 607 | u8 offset = calc_offset(start); 608 | u8 len = bio_sectors(bio); 609 | u8 copy_bits = to_mask(offset, len); 610 | 611 | int err = 0; 612 | void *buf = mempool_alloc(wb->buf_8_pool, GFP_NOIO); 613 | if (!buf) 614 | return -ENOMEM; 615 | 616 | io_req = (struct dm_io_request) { 617 | WB_IO_READ, 618 | .client = wb->io_client, 619 | .notify.fn = NULL, 620 | .mem.type = DM_IO_KMEM, 621 | .mem.ptr.addr = buf + (offset << 9), 622 | }; 623 | region = (struct dm_io_region) { 624 | .bdev = wb->backing_dev->bdev, 625 | .sector = start, 626 | .count = len, 627 | }; 628 | err = wb_io(&io_req, 1, ®ion, NULL, true); 629 | if (err) 630 | goto bad; 631 | 632 | copy_to_bio_payload(bio, buf, copy_bits); 633 | bad: 634 | mempool_free(buf, wb->buf_8_pool); 635 | return err; 636 | } 637 | 638 | /* 639 | * Get the reference to the 4KB-aligned data in RAM buffer. 640 | * Since it only takes the reference caller need not to free the pointer. 641 | */ 642 | static void *ref_buffered_mb(struct wb_device *wb, struct metablock *mb) 643 | { 644 | sector_t offset = ((mb_idx_inseg(wb, mb->idx) + 1) << 3); 645 | return wb->current_rambuf->data + (offset << 9); 646 | } 647 | 648 | /* 649 | * Read cache block of the mb. 650 | * Caller should free the returned pointer after used by mempool_alloc(). 
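 *
 * The buffer comes from wb->buf_8_pool, so release it with
 * mempool_free(buf, wb->buf_8_pool), as prepare_overwrite() and
 * process_read() do.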
651 | */ 652 | static void *read_mb(struct wb_device *wb, struct segment_header *seg, 653 | struct metablock *mb, u8 data_bits) 654 | { 655 | u8 i; 656 | void *result = mempool_alloc(wb->buf_8_pool, GFP_NOIO); 657 | if (!result) 658 | return NULL; 659 | 660 | for (i = 0; i < 8; i++) { 661 | int err = 0; 662 | struct dm_io_request io_req; 663 | struct dm_io_region region; 664 | 665 | if (!(data_bits & (1 << i))) 666 | continue; 667 | 668 | io_req = (struct dm_io_request) { 669 | WB_IO_READ, 670 | .client = wb->io_client, 671 | .notify.fn = NULL, 672 | .mem.type = DM_IO_KMEM, 673 | .mem.ptr.addr = result + (i << 9), 674 | }; 675 | 676 | region = (struct dm_io_region) { 677 | .bdev = wb->cache_dev->bdev, 678 | .sector = calc_mb_start_sector(wb, seg, mb->idx) + i, 679 | .count = 1, 680 | }; 681 | 682 | err = wb_io(&io_req, 1, ®ion, NULL, true); 683 | if (err) { 684 | mempool_free(result, wb->buf_8_pool); 685 | return NULL; 686 | } 687 | } 688 | return result; 689 | } 690 | 691 | /*----------------------------------------------------------------------------*/ 692 | 693 | enum PBD_FLAG { 694 | PBD_NONE = 0, 695 | PBD_WILL_CACHE = 1, 696 | PBD_READ_SEG = 2, 697 | }; 698 | 699 | #if (LINUX_VERSION_CODE >= KERNEL_VERSION(4,6,0) || RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,3)) 700 | #define PER_BIO_DATA_SIZE per_io_data_size 701 | #else 702 | #define PER_BIO_DATA_SIZE per_bio_data_size 703 | #endif 704 | struct per_bio_data { 705 | enum PBD_FLAG type; 706 | union { 707 | u32 cell_idx; 708 | struct segment_header *seg; 709 | }; 710 | }; 711 | #define per_bio_data(bio) ((struct per_bio_data *) dm_per_bio_data((bio), sizeof(struct per_bio_data))) 712 | 713 | /*----------------------------------------------------------------------------*/ 714 | 715 | #define read_cache_cell_from_node(node) rb_entry((node), struct read_cache_cell, rb_node) 716 | 717 | static void read_cache_add(struct read_cache_cells *cells, struct read_cache_cell *cell) 718 | { 719 | struct rb_node **rbp, *parent; 720 | rbp = &cells->rb_root.rb_node; 721 | parent = NULL; 722 | while (*rbp) { 723 | struct read_cache_cell *parent_cell; 724 | parent = *rbp; 725 | parent_cell = read_cache_cell_from_node(parent); 726 | if (cell->sector < parent_cell->sector) 727 | rbp = &(*rbp)->rb_left; 728 | else 729 | rbp = &(*rbp)->rb_right; 730 | } 731 | rb_link_node(&cell->rb_node, parent, rbp); 732 | rb_insert_color(&cell->rb_node, &cells->rb_root); 733 | } 734 | 735 | static struct read_cache_cell *lookup_read_cache_cell(struct read_cache_cells *cells, sector_t sector) 736 | { 737 | struct rb_node **rbp, *parent; 738 | rbp = &cells->rb_root.rb_node; 739 | parent = NULL; 740 | while (*rbp) { 741 | struct read_cache_cell *parent_cell; 742 | parent = *rbp; 743 | parent_cell = read_cache_cell_from_node(parent); 744 | if (parent_cell->sector == sector) 745 | return parent_cell; 746 | 747 | if (sector < parent_cell->sector) 748 | rbp = &(*rbp)->rb_left; 749 | else 750 | rbp = &(*rbp)->rb_right; 751 | } 752 | return NULL; 753 | } 754 | 755 | /* 756 | * Cancel all cells in [cursor, cursor + seqcount). 
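 * Handed-out cells occupy indices [cursor, size) of the array (the cursor
 * counts down from cells->size), hence the clamp below.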
757 | */ 758 | static void read_cache_cancel_seq_cells(struct read_cache_cells *cells) 759 | { 760 | u32 i; 761 | u32 last = cells->cursor + cells->seqcount; 762 | if (last > cells->size) 763 | last = cells->size; 764 | for (i = cells->cursor; i < last; i++) { 765 | struct read_cache_cell *cell = cells->array + i; 766 | atomic_set(&cell->cancelled, 1); 767 | } 768 | } 769 | 770 | /* 771 | * Track the forefront read address and cancel cells in case of over threshold. 772 | * If the cell is cancelled foreground, we can save the memory copy in the background. 773 | */ 774 | static void read_cache_cancel_foreground(struct read_cache_cells *cells, 775 | struct read_cache_cell *new_cell) 776 | { 777 | if (new_cell->sector == (cells->last_sector + 8)) 778 | cells->seqcount++; 779 | else { 780 | cells->seqcount = 1; 781 | cells->over_threshold = false; 782 | } 783 | cells->last_sector = new_cell->sector; 784 | 785 | if (cells->seqcount > cells->threshold) { 786 | if (cells->over_threshold) 787 | atomic_set(&new_cell->cancelled, 1); 788 | else { 789 | cells->over_threshold = true; 790 | read_cache_cancel_seq_cells(cells); 791 | } 792 | } 793 | } 794 | 795 | static bool do_reserve_read_cache_cell(struct read_cache_cells *cells, struct bio *bio) 796 | { 797 | struct read_cache_cell *found, *new_cell; 798 | 799 | ASSERT(cells->threshold > 0); 800 | if (!cells->cursor) 801 | return false; 802 | 803 | /* 804 | * We don't need to reserve the same address twice 805 | * because it's either unchanged or invalidated. 806 | */ 807 | found = lookup_read_cache_cell(cells, bi_sector(bio)); 808 | if (found) 809 | return false; 810 | 811 | cells->cursor--; 812 | new_cell = cells->array + cells->cursor; 813 | new_cell->sector = bi_sector(bio); 814 | read_cache_add(cells, new_cell); 815 | 816 | /* Cancel the new_cell if needed */ 817 | read_cache_cancel_foreground(cells, new_cell); 818 | 819 | return true; 820 | } 821 | 822 | static bool reserve_read_cache_cell(struct wb_device *wb, struct bio *bio) 823 | { 824 | struct per_bio_data *pbd; 825 | struct read_cache_cells *cells = wb->read_cache_cells; 826 | bool reserved; 827 | 828 | if (!read_once(wb->read_cache_threshold)) 829 | return false; 830 | 831 | /* 832 | * We only cache 4KB read data for following reasons: 833 | * 1) Caching partial data (< 4KB) is likely meaningless. 834 | * 2) Caching partial data makes the read-caching mechanism very hard. 
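 * (A full-size bio here means bio_sectors(bio) == 8, i.e. exactly one
 * 4KB block; see bio_is_fullsize().)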
835 | */ 836 | if (!bio_is_fullsize(bio)) 837 | return false; 838 | 839 | mutex_lock(&cells->lock); 840 | reserved = do_reserve_read_cache_cell(cells, bio); 841 | mutex_unlock(&cells->lock); 842 | 843 | if (!reserved) 844 | return false; 845 | 846 | pbd = per_bio_data(bio); 847 | pbd->type = PBD_WILL_CACHE; 848 | pbd->cell_idx = cells->cursor; 849 | 850 | return true; 851 | } 852 | 853 | static void might_cancel_read_cache_cell(struct wb_device *wb, struct bio *bio) 854 | { 855 | struct read_cache_cell *found; 856 | struct read_cache_cells *cells = wb->read_cache_cells; 857 | 858 | mutex_lock(&cells->lock); 859 | found = lookup_read_cache_cell(cells, calc_cache_alignment(bi_sector(bio))); 860 | mutex_unlock(&cells->lock); 861 | 862 | if (found) 863 | atomic_set(&found->cancelled, 1); 864 | } 865 | 866 | static void read_cache_cell_copy_data(struct wb_device *wb, struct bio *bio, unsigned long error) 867 | { 868 | struct per_bio_data *pbd = per_bio_data(bio); 869 | struct read_cache_cells *cells = wb->read_cache_cells; 870 | struct read_cache_cell *cell = cells->array + pbd->cell_idx; 871 | 872 | ASSERT(pbd->type == PBD_WILL_CACHE); 873 | 874 | /* Data can be broken. So don't stage. */ 875 | if (error) 876 | atomic_set(&cell->cancelled, 1); 877 | 878 | /* 879 | * We can omit copying if the cell is cancelled but 880 | * copying for a non-cancelled cell isn't problematic. 881 | */ 882 | if (!atomic_read(&cell->cancelled)) 883 | copy_bio_payload(cell->data, bio); 884 | 885 | if (atomic_dec_and_test(&cells->ack_count)) 886 | queue_work(cells->wq, &wb->read_cache_work); 887 | } 888 | 889 | /* 890 | * Get a read cache cell through simplified write path if the cell data isn't stale. 891 | */ 892 | static void inject_read_cache(struct wb_device *wb, struct read_cache_cell *cell) 893 | { 894 | struct metablock *mb; 895 | u32 _mb_idx_inseg; 896 | struct segment_header *seg; 897 | 898 | struct lookup_key key = { 899 | .sector = cell->sector, 900 | }; 901 | struct ht_head *head = ht_get_head(wb, &key); 902 | 903 | mutex_lock(&wb->io_lock); 904 | /* 905 | * if might_cancel_read_cache_cell() on the foreground 906 | * cancelled this cell, the data is now stale. 907 | */ 908 | if (atomic_read(&cell->cancelled)) { 909 | mutex_unlock(&wb->io_lock); 910 | return; 911 | } 912 | 913 | might_queue_current_buffer(wb); 914 | 915 | seg = wb->current_seg; 916 | _mb_idx_inseg = mb_idx_inseg(wb, advance_cursor(wb)); 917 | 918 | /* 919 | * We should copy the cell data into the rambuf with lock held 920 | * otherwise subsequent write data may be written first and then overwritten by 921 | * the old data in the cell. 
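 *
 * Illustrative ordering if the copy were done outside the lock:
 *   1. this path decides the cell is still valid,
 *   2. a foreground write stages fresh data for the same sector,
 *   3. the stale cell data is then copied over the fresh data.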
922 | */ 923 | memcpy(wb->current_rambuf->data + ((_mb_idx_inseg + 1) << 12), cell->data, 1 << 12); 924 | 925 | mb = seg->mb_array + _mb_idx_inseg; 926 | ASSERT(!mb->dirtiness.is_dirty); 927 | mb->dirtiness.data_bits = 255; 928 | 929 | ht_register(wb, head, mb, &key); 930 | 931 | mutex_unlock(&wb->io_lock); 932 | 933 | dec_inflight_ios(wb, seg); 934 | } 935 | 936 | static void free_read_cache_cell_data(struct read_cache_cells *cells) 937 | { 938 | u32 i; 939 | for (i = 0; i < cells->size; i++) { 940 | struct read_cache_cell *cell = cells->array + i; 941 | vfree(cell->data); 942 | } 943 | } 944 | 945 | static struct read_cache_cells *alloc_read_cache_cells(struct wb_device *wb, u32 n) 946 | { 947 | struct read_cache_cells *cells; 948 | u32 i; 949 | cells = kmalloc(sizeof(struct read_cache_cells), GFP_KERNEL); 950 | if (!cells) 951 | return NULL; 952 | 953 | mutex_init(&cells->lock); 954 | cells->size = n; 955 | cells->threshold = UINT_MAX; /* Default: every read will be cached */ 956 | cells->last_sector = ~0; 957 | cells->seqcount = 0; 958 | cells->over_threshold = false; 959 | cells->array = kmalloc(sizeof(struct read_cache_cell) * n, GFP_KERNEL); 960 | if (!cells->array) 961 | goto bad_cells_array; 962 | 963 | for (i = 0; i < cells->size; i++) { 964 | struct read_cache_cell *cell = cells->array + i; 965 | cell->data = vmalloc(1 << 12); 966 | if (!cell->data) { 967 | u32 j; 968 | for (j = 0; j < i; j++) { 969 | cell = cells->array + j; 970 | vfree(cell->data); 971 | } 972 | goto bad_cell_data; 973 | } 974 | } 975 | 976 | cells->wq = create_singlethread_workqueue("dmwb_read_cache"); 977 | if (!cells->wq) 978 | goto bad_wq; 979 | 980 | return cells; 981 | 982 | bad_wq: 983 | free_read_cache_cell_data(cells); 984 | bad_cell_data: 985 | kfree(cells->array); 986 | bad_cells_array: 987 | kfree(cells); 988 | return NULL; 989 | } 990 | 991 | static void free_read_cache_cells(struct wb_device *wb) 992 | { 993 | struct read_cache_cells *cells = wb->read_cache_cells; 994 | destroy_workqueue(cells->wq); /* This drains wq. So, must precede the others */ 995 | free_read_cache_cell_data(cells); 996 | kfree(cells->array); 997 | kfree(cells); 998 | } 999 | 1000 | static void reinit_read_cache_cells(struct read_cache_cells *cells, u32 new_threshold) 1001 | { 1002 | u32 i; 1003 | 1004 | cells->rb_root = RB_ROOT; 1005 | cells->cursor = cells->size; 1006 | atomic_set(&cells->ack_count, cells->size); 1007 | for (i = 0; i < cells->size; i++) { 1008 | struct read_cache_cell *cell = cells->array + i; 1009 | atomic_set(&cell->cancelled, 0); 1010 | } 1011 | if (new_threshold && (new_threshold != cells->threshold)) { 1012 | cells->threshold = new_threshold; 1013 | cells->over_threshold = false; 1014 | } 1015 | } 1016 | 1017 | /* 1018 | * Cancel cells [first, last) 1019 | */ 1020 | static void visit_and_cancel_cells(struct rb_node *first, struct rb_node *last) 1021 | { 1022 | struct rb_node *rbp = first; 1023 | while (rbp != last) { 1024 | struct read_cache_cell *cell = read_cache_cell_from_node(rbp); 1025 | atomic_set(&cell->cancelled, 1); 1026 | rbp = rb_next(rbp); 1027 | } 1028 | } 1029 | 1030 | /* 1031 | * Find out sequence from cells and cancel them if larger than threshold. 
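 * Cells are visited in sector order via the rb-tree; a run is detected
 * whenever a cell starts exactly 8 sectors (4KB) after the previous one.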
1032 | */ 1033 | static void read_cache_cancel_background(struct read_cache_cells *cells) 1034 | { 1035 | struct rb_node *rbp = rb_first(&cells->rb_root); 1036 | struct rb_node *seqhead = rbp; 1037 | sector_t last_sector = ~0; 1038 | u32 seqcount = 0; 1039 | 1040 | while (rbp) { 1041 | struct read_cache_cell *cell = read_cache_cell_from_node(rbp); 1042 | if (cell->sector == (last_sector + 8)) 1043 | seqcount++; 1044 | else { 1045 | if (seqcount > cells->threshold) 1046 | visit_and_cancel_cells(seqhead, rbp); 1047 | seqcount = 1; 1048 | seqhead = rbp; 1049 | } 1050 | last_sector = cell->sector; 1051 | rbp = rb_next(rbp); 1052 | } 1053 | if (seqcount > cells->threshold) 1054 | visit_and_cancel_cells(seqhead, rbp); 1055 | } 1056 | 1057 | static void read_cache_proc(struct work_struct *work) 1058 | { 1059 | struct wb_device *wb = container_of(work, struct wb_device, read_cache_work); 1060 | struct read_cache_cells *cells = wb->read_cache_cells; 1061 | u32 i; 1062 | 1063 | read_cache_cancel_background(cells); 1064 | 1065 | for (i = 0; i < cells->size; i++) { 1066 | struct read_cache_cell *cell = cells->array + i; 1067 | inject_read_cache(wb, cell); 1068 | } 1069 | 1070 | mutex_lock(&cells->lock); 1071 | reinit_read_cache_cells(cells, read_once(wb->read_cache_threshold)); 1072 | mutex_unlock(&cells->lock); 1073 | } 1074 | 1075 | static int init_read_cache_cells(struct wb_device *wb) 1076 | { 1077 | struct read_cache_cells *cells; 1078 | INIT_WORK(&wb->read_cache_work, read_cache_proc); 1079 | cells = alloc_read_cache_cells(wb, wb->nr_read_cache_cells); 1080 | if (!cells) 1081 | return -ENOMEM; 1082 | wb->read_cache_cells = cells; 1083 | reinit_read_cache_cells(cells, wb->read_cache_threshold); 1084 | return 0; 1085 | } 1086 | 1087 | /*----------------------------------------------------------------------------*/ 1088 | 1089 | static void initialize_write_io(struct write_io *wio, struct bio *bio) 1090 | { 1091 | u8 offset = bio_calc_offset(bio); 1092 | sector_t count = bio_sectors(bio); 1093 | copy_bio_payload(wio->data + (offset << 9), bio); 1094 | wio->data_bits = to_mask(offset, count); 1095 | } 1096 | 1097 | static void memcpy_masked(void *to, u8 protect_bits, void *from, u8 copy_bits) 1098 | { 1099 | u8 i; 1100 | for (i = 0; i < 8; i++) { 1101 | bool will_copy = copy_bits & (1 << i); 1102 | bool protected = protect_bits & (1 << i); 1103 | if (will_copy && (!protected)) { 1104 | size_t offset = (i << 9); 1105 | memcpy(to + offset, from + offset, 1 << 9); 1106 | } 1107 | } 1108 | } 1109 | 1110 | int prepare_overwrite(struct wb_device *wb, struct segment_header *seg, struct metablock *old_mb, struct write_io* wio, u8 overwrite_bits) 1111 | { 1112 | struct dirtiness dirtiness = read_mb_dirtiness(wb, seg, old_mb); 1113 | 1114 | bool needs_merge_prev_cache = !(overwrite_bits == 255) || !(dirtiness.data_bits == 255); 1115 | 1116 | if (!dirtiness.is_dirty) 1117 | needs_merge_prev_cache = false; 1118 | 1119 | if (overwrite_bits == 255) 1120 | needs_merge_prev_cache = false; 1121 | 1122 | if (unlikely(needs_merge_prev_cache)) { 1123 | void *buf; 1124 | 1125 | wait_for_flushing(wb, seg->id); 1126 | ASSERT(dirtiness.is_dirty); 1127 | 1128 | buf = read_mb(wb, seg, old_mb, dirtiness.data_bits); 1129 | if (!buf) 1130 | return -EIO; 1131 | 1132 | /* newer data should be prioritized */ 1133 | memcpy_masked(wio->data, wio->data_bits, buf, dirtiness.data_bits); 1134 | wio->data_bits |= dirtiness.data_bits; 1135 | mempool_free(buf, wb->buf_8_pool); 1136 | } 1137 | 1138 | if (mark_clean_mb(wb, old_mb)) 1139 | 
dec_nr_dirty_caches(wb); 1140 | 1141 | ht_del(wb, old_mb); 1142 | 1143 | return 0; 1144 | } 1145 | 1146 | /* 1147 | * Get a new place to write. 1148 | */ 1149 | static struct metablock *prepare_new_write_pos(struct wb_device *wb) 1150 | { 1151 | struct metablock *ret = wb->current_seg->mb_array + mb_idx_inseg(wb, advance_cursor(wb)); 1152 | ASSERT(!ret->dirtiness.is_dirty); 1153 | ret->dirtiness.data_bits = 0; 1154 | return ret; 1155 | } 1156 | 1157 | static void write_on_rambuffer(struct wb_device *wb, struct metablock *write_pos, struct write_io *wio) 1158 | { 1159 | size_t mb_offset = (mb_idx_inseg(wb, write_pos->idx) + 1) << 12; 1160 | void *mb_data = wb->current_rambuf->data + mb_offset; 1161 | if (wio->data_bits == 255) 1162 | memcpy(mb_data, wio->data, 1 << 12); 1163 | else 1164 | memcpy_masked(mb_data, 0, wio->data, wio->data_bits); 1165 | } 1166 | 1167 | static int do_process_write(struct wb_device *wb, struct bio *bio) 1168 | { 1169 | int err = 0; 1170 | 1171 | struct metablock *write_pos = NULL; 1172 | struct lookup_result res; 1173 | 1174 | struct write_io wio; 1175 | wio.data = mempool_alloc(wb->buf_8_pool, GFP_NOIO); 1176 | if (!wio.data) 1177 | return -ENOMEM; 1178 | initialize_write_io(&wio, bio); 1179 | 1180 | mutex_lock(&wb->io_lock); 1181 | 1182 | cache_lookup(wb, bio, &res); 1183 | if (res.found) { 1184 | if (unlikely(res.on_buffer)) { 1185 | write_pos = res.found_mb; 1186 | goto do_write; 1187 | } else { 1188 | err = prepare_overwrite(wb, res.found_seg, res.found_mb, &wio, wio.data_bits); 1189 | dec_inflight_ios(wb, res.found_seg); 1190 | if (err) 1191 | goto out; 1192 | } 1193 | } 1194 | might_cancel_read_cache_cell(wb, bio); 1195 | 1196 | might_queue_current_buffer(wb); 1197 | 1198 | write_pos = prepare_new_write_pos(wb); 1199 | 1200 | do_write: 1201 | ASSERT(write_pos); 1202 | write_on_rambuffer(wb, write_pos, &wio); 1203 | 1204 | if (taint_mb(wb, write_pos, wio.data_bits)) 1205 | inc_nr_dirty_caches(wb); 1206 | 1207 | ht_register(wb, res.head, write_pos, &res.key); 1208 | 1209 | out: 1210 | mutex_unlock(&wb->io_lock); 1211 | mempool_free(wio.data, wb->buf_8_pool); 1212 | return err; 1213 | } 1214 | 1215 | static int complete_process_write(struct wb_device *wb, struct bio *bio) 1216 | { 1217 | dec_inflight_ios(wb, wb->current_seg); 1218 | 1219 | /* 1220 | * bio with FUA flag has data. 1221 | * We first handle it as a normal write bio and then as a barrier bio. 1222 | */ 1223 | if (bio_is_fua(bio)) { 1224 | queue_barrier_io(wb, bio); 1225 | return DM_MAPIO_SUBMITTED; 1226 | } 1227 | 1228 | bio_io_success_compat(bio); 1229 | return DM_MAPIO_SUBMITTED; 1230 | } 1231 | 1232 | /* 1233 | * (Locking) Dirtiness of a metablock 1234 | * ---------------------------------- 1235 | * A cache data is placed either on RAM buffer or SSD if it was flushed. 1236 | * To make locking easy, we simplify the rule for the dirtiness of a cache data. 1237 | * 1) If the data is on the RAM buffer, the dirtiness only "increases". 1238 | * 2) If the data is, on the other hand, on the SSD after flushed the dirtiness 1239 | * only "decreases". 1240 | * 1241 | * These simple rules can remove the possibility of dirtiness fluctuate on the 1242 | * RAM buffer. 
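 *
 * Concretely, taint_mb() (which only sets bits) is used for data on the
 * RAM buffer, while mark_clean_mb() (which only clears the dirty flag) is
 * used once the data has been flushed, e.g. in prepare_overwrite() and in
 * the writeback path.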
1243 | */ 1244 | 1245 | /* 1246 | * (Locking) Refcount (in_flight_*) 1247 | * -------------------------------- 1248 | * 1249 | * The basic common idea is 1250 | * 1) Increment the refcount inside lock 1251 | * 2) Wait for decrement outside the lock 1252 | * 1253 | * process_write: 1254 | * do_process_write: 1255 | * mutex_lock (to serialize write) 1256 | * inc in_flight_ios # refcount on the dst segment 1257 | * mutex_unlock 1258 | * 1259 | * complete_process_write: 1260 | * dec in_flight_ios 1261 | * bio_endio(bio) 1262 | */ 1263 | static int process_write_wb(struct wb_device *wb, struct bio *bio) 1264 | { 1265 | int err = do_process_write(wb, bio); 1266 | if (err) { 1267 | bio_io_error(bio); 1268 | return DM_MAPIO_SUBMITTED; 1269 | } 1270 | return complete_process_write(wb, bio); 1271 | } 1272 | 1273 | static int process_write_wa(struct wb_device *wb, struct bio *bio) 1274 | { 1275 | struct lookup_result res; 1276 | 1277 | mutex_lock(&wb->io_lock); 1278 | cache_lookup(wb, bio, &res); 1279 | if (res.found) { 1280 | dec_inflight_ios(wb, res.found_seg); 1281 | ht_del(wb, res.found_mb); 1282 | } 1283 | might_cancel_read_cache_cell(wb, bio); 1284 | mutex_unlock(&wb->io_lock); 1285 | 1286 | bio_remap(bio, wb->backing_dev, bi_sector(bio)); 1287 | return DM_MAPIO_REMAPPED; 1288 | } 1289 | 1290 | static int process_write(struct wb_device *wb, struct bio *bio) 1291 | { 1292 | return wb->write_around_mode ? process_write_wa(wb, bio) : process_write_wb(wb, bio); 1293 | } 1294 | 1295 | struct read_backing_async_context { 1296 | struct wb_device *wb; 1297 | struct bio *bio; 1298 | }; 1299 | 1300 | static void read_backing_async_callback_onstack(unsigned long error, struct read_backing_async_context *ctx) 1301 | { 1302 | ASSERT(bio_is_fullsize(ctx->bio)); 1303 | 1304 | read_cache_cell_copy_data(ctx->wb, ctx->bio, error); 1305 | 1306 | if (error) 1307 | bio_io_error(ctx->bio); 1308 | else 1309 | bio_io_success_compat(ctx->bio); 1310 | } 1311 | 1312 | static void read_backing_async_callback(unsigned long error, void *context) 1313 | { 1314 | struct read_backing_async_context *ctx = context; 1315 | read_backing_async_callback_onstack(error, ctx); 1316 | kfree(ctx); 1317 | } 1318 | 1319 | static int read_backing_async(struct wb_device *wb, struct bio *bio) 1320 | { 1321 | int err = 0; 1322 | 1323 | struct dm_io_request io_req; 1324 | struct dm_io_region region; 1325 | 1326 | struct read_backing_async_context *ctx = kmalloc(sizeof(struct read_backing_async_context), GFP_NOIO); 1327 | if (!ctx) 1328 | return -ENOMEM; 1329 | 1330 | ctx->wb = wb; 1331 | ctx->bio = bio; 1332 | 1333 | ASSERT(bio_is_fullsize(bio)); 1334 | 1335 | io_req = (struct dm_io_request) { 1336 | WB_IO_READ, 1337 | .client = wb->io_client, 1338 | #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,14,0) 1339 | .mem.type = DM_IO_BIO, 1340 | .mem.ptr.bio = bio, 1341 | #else 1342 | .mem.type = DM_IO_BVEC, 1343 | .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx, 1344 | #endif 1345 | .notify.fn = read_backing_async_callback, 1346 | .notify.context = ctx 1347 | }; 1348 | region = (struct dm_io_region) { 1349 | .bdev = wb->backing_dev->bdev, 1350 | .sector = bi_sector(bio), 1351 | .count = 8 1352 | }; 1353 | 1354 | err = wb_io(&io_req, 1, ®ion, NULL, false); 1355 | if (err) 1356 | kfree(ctx); 1357 | 1358 | return err; 1359 | } 1360 | 1361 | static int process_read(struct wb_device *wb, struct bio *bio) 1362 | { 1363 | struct lookup_result res; 1364 | struct dirtiness dirtiness; 1365 | struct per_bio_data *pbd; 1366 | 1367 | bool reserved = false; 1368 | 1369 | 
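/*
 * The lookup and the read-cache-cell reservation are done under the same
 * wb->io_lock that the write paths take, so they cannot interleave with a
 * concurrent write to the same 4KB block.
 */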
mutex_lock(&wb->io_lock); 1370 | cache_lookup(wb, bio, &res); 1371 | if (!res.found) 1372 | reserved = reserve_read_cache_cell(wb, bio); 1373 | mutex_unlock(&wb->io_lock); 1374 | 1375 | if (!res.found) { 1376 | if (reserved) { 1377 | /* 1378 | * Remapping clone bio to the backing store leads to 1379 | * empty payload in clone_endio(). 1380 | * To avoid caching junk data, we need this workaround 1381 | * to call dm_io() to certainly fill the bio payload. 1382 | */ 1383 | if (read_backing_async(wb, bio)) { 1384 | struct read_backing_async_context ctx = { 1385 | .wb = wb, 1386 | .bio = bio 1387 | }; 1388 | read_backing_async_callback_onstack(1, &ctx); 1389 | } 1390 | return DM_MAPIO_SUBMITTED; 1391 | } else { 1392 | bio_remap(bio, wb->backing_dev, bi_sector(bio)); 1393 | return DM_MAPIO_REMAPPED; 1394 | } 1395 | } 1396 | 1397 | dirtiness = read_mb_dirtiness(wb, res.found_seg, res.found_mb); 1398 | if (unlikely(res.on_buffer)) { 1399 | int err = fill_payload_by_backing(wb, bio); 1400 | if (err) 1401 | goto read_buffered_mb_exit; 1402 | 1403 | if (dirtiness.is_dirty) 1404 | copy_to_bio_payload(bio, ref_buffered_mb(wb, res.found_mb), dirtiness.data_bits); 1405 | 1406 | read_buffered_mb_exit: 1407 | dec_inflight_ios(wb, res.found_seg); 1408 | 1409 | if (unlikely(err)) 1410 | bio_io_error(bio); 1411 | else 1412 | bio_io_success_compat(bio); 1413 | 1414 | return DM_MAPIO_SUBMITTED; 1415 | } 1416 | 1417 | /* 1418 | * We need to wait for the segment to be flushed to the cache device. 1419 | * Without this, we might read the wrong data from the cache device. 1420 | */ 1421 | wait_for_flushing(wb, res.found_seg->id); 1422 | 1423 | if (unlikely(dirtiness.data_bits != 255)) { 1424 | int err = fill_payload_by_backing(wb, bio); 1425 | if (err) 1426 | goto read_mb_exit; 1427 | 1428 | if (dirtiness.is_dirty) { 1429 | void *buf = read_mb(wb, res.found_seg, res.found_mb, dirtiness.data_bits); 1430 | if (!buf) { 1431 | err = -EIO; 1432 | goto read_mb_exit; 1433 | } 1434 | copy_to_bio_payload(bio, buf, dirtiness.data_bits); 1435 | mempool_free(buf, wb->buf_8_pool); 1436 | } 1437 | 1438 | read_mb_exit: 1439 | dec_inflight_ios(wb, res.found_seg); 1440 | 1441 | if (unlikely(err)) 1442 | bio_io_error(bio); 1443 | else 1444 | bio_io_success_compat(bio); 1445 | 1446 | return DM_MAPIO_SUBMITTED; 1447 | } 1448 | 1449 | pbd = per_bio_data(bio); 1450 | pbd->type = PBD_READ_SEG; 1451 | pbd->seg = res.found_seg; 1452 | 1453 | bio_remap(bio, wb->cache_dev, 1454 | calc_mb_start_sector(wb, res.found_seg, res.found_mb->idx) + 1455 | bio_calc_offset(bio)); 1456 | 1457 | return DM_MAPIO_REMAPPED; 1458 | } 1459 | 1460 | static int process_bio(struct wb_device *wb, struct bio *bio) 1461 | { 1462 | return bio_is_write(bio) ? process_write(wb, bio) : process_read(wb, bio); 1463 | } 1464 | 1465 | static int process_barrier_bio(struct wb_device *wb, struct bio *bio) 1466 | { 1467 | /* barrier bio doesn't have data */ 1468 | ASSERT(bio_sectors(bio) == 0); 1469 | queue_barrier_io(wb, bio); 1470 | return DM_MAPIO_SUBMITTED; 1471 | } 1472 | 1473 | static int writeboost_map(struct dm_target *ti, struct bio *bio) 1474 | { 1475 | struct wb_device *wb = ti->private; 1476 | 1477 | struct per_bio_data *pbd = per_bio_data(bio); 1478 | pbd->type = PBD_NONE; 1479 | 1480 | if (bio_is_barrier(bio)) 1481 | return process_barrier_bio(wb, bio); 1482 | 1483 | return process_bio(wb, bio); 1484 | } 1485 | 1486 | /* 1487 | * DM_ENDIO_DONE was actually introduced since 4.12 but used restrictedly in rq-based dm. 
1488 | * In 4.13, a patch titled "dm: change ->end_io calling convention" changed the dm internal 1489 | * so other bio-based dm targets should follow the convension. 1490 | * For this reason, I will start to use the DM_ENDIO_DONE at 4.13. 1491 | */ 1492 | #if LINUX_VERSION_CODE >= KERNEL_VERSION(4,13,0) 1493 | #define DM_ENDIO_DONE_COMPAT DM_ENDIO_DONE 1494 | static int writeboost_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *error) 1495 | #else 1496 | #define DM_ENDIO_DONE_COMPAT 0 1497 | static int writeboost_end_io(struct dm_target *ti, struct bio *bio, int error) 1498 | #endif 1499 | { 1500 | struct wb_device *wb = ti->private; 1501 | struct per_bio_data *pbd = per_bio_data(bio); 1502 | 1503 | switch (pbd->type) { 1504 | case PBD_NONE: 1505 | case PBD_WILL_CACHE: 1506 | return DM_ENDIO_DONE_COMPAT; 1507 | case PBD_READ_SEG: 1508 | dec_inflight_ios(wb, pbd->seg); 1509 | return DM_ENDIO_DONE_COMPAT; 1510 | default: 1511 | BUG(); 1512 | } 1513 | } 1514 | 1515 | static int consume_essential_argv(struct wb_device *wb, struct dm_arg_set *as) 1516 | { 1517 | int err = 0; 1518 | struct dm_target *ti = wb->ti; 1519 | 1520 | err = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table), 1521 | &wb->backing_dev); 1522 | if (err) { 1523 | DMERR("Failed to get backing_dev"); 1524 | return err; 1525 | } 1526 | 1527 | err = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table), 1528 | &wb->cache_dev); 1529 | if (err) { 1530 | DMERR("Failed to get cache_dev"); 1531 | goto bad_get_cache; 1532 | } 1533 | 1534 | return err; 1535 | 1536 | bad_get_cache: 1537 | dm_put_device(ti, wb->backing_dev); 1538 | return err; 1539 | } 1540 | 1541 | #define consume_kv(name, nr, is_static) { \ 1542 | if (!strcasecmp(key, #name)) { \ 1543 | if (!argc) \ 1544 | break; \ 1545 | if (test_bit(WB_CREATED, &wb->flags) && is_static) { \ 1546 | DMERR("%s is a static option", #name); \ 1547 | break; \ 1548 | } \ 1549 | err = dm_read_arg(_args + (nr), as, &tmp, &ti->error); \ 1550 | if (err) { \ 1551 | DMERR("%s", ti->error); \ 1552 | break; \ 1553 | } \ 1554 | wb->name = tmp; \ 1555 | } } 1556 | 1557 | static int do_consume_optional_argv(struct wb_device *wb, struct dm_arg_set *as, unsigned argc) 1558 | { 1559 | int err = 0; 1560 | struct dm_target *ti = wb->ti; 1561 | 1562 | static struct dm_arg _args[] = { 1563 | {0, 100, "Invalid writeback_threshold"}, 1564 | {1, 32, "Invalid nr_max_batched_writeback"}, 1565 | {0, 3600, "Invalid update_sb_record_interval"}, 1566 | {0, 3600, "Invalid sync_data_interval"}, 1567 | {0, 127, "Invalid read_cache_threshold"}, 1568 | {0, 1, "Invalid write_around_mode"}, 1569 | {1, 2048, "Invalid nr_read_cache_cells"}, 1570 | }; 1571 | unsigned tmp; 1572 | 1573 | while (argc) { 1574 | const char *key = dm_shift_arg(as); 1575 | argc--; 1576 | 1577 | err = -EINVAL; 1578 | 1579 | consume_kv(writeback_threshold, 0, false); 1580 | consume_kv(nr_max_batched_writeback, 1, false); 1581 | consume_kv(update_sb_record_interval, 2, false); 1582 | consume_kv(sync_data_interval, 3, false); 1583 | consume_kv(read_cache_threshold, 4, false); 1584 | consume_kv(write_around_mode, 5, true); 1585 | consume_kv(nr_read_cache_cells, 6, true); 1586 | 1587 | if (!err) { 1588 | argc--; 1589 | } else { 1590 | ti->error = "Invalid optional key"; 1591 | break; 1592 | } 1593 | } 1594 | 1595 | return err; 1596 | } 1597 | 1598 | static int consume_optional_argv(struct wb_device *wb, struct dm_arg_set *as) 1599 | { 1600 | int err = 0; 1601 | struct dm_target *ti = wb->ti; 1602 | 1603 | static struct 
dm_arg _args[] = { 1604 | {0, 14, "Invalid optional argc"}, 1605 | }; 1606 | unsigned argc = 0; 1607 | 1608 | if (as->argc) { 1609 | err = dm_read_arg_group(_args, as, &argc, &ti->error); 1610 | if (err) { 1611 | DMERR("%s", ti->error); 1612 | return err; 1613 | } 1614 | } 1615 | 1616 | return do_consume_optional_argv(wb, as, argc); 1617 | } 1618 | 1619 | DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(wb_copy_throttle, 1620 | "A percentage of time allocated for one-shot writeback"); 1621 | 1622 | static int init_core_struct(struct dm_target *ti) 1623 | { 1624 | int err = 0; 1625 | struct wb_device *wb; 1626 | 1627 | err = dm_set_target_max_io_len(ti, 1 << 3); 1628 | if (err) { 1629 | DMERR("Failed to set max_io_len"); 1630 | return err; 1631 | } 1632 | 1633 | ti->num_flush_bios = 1; 1634 | ti->flush_supported = true; 1635 | 1636 | /* 1637 | * dm-writeboost does't support TRIM 1638 | * 1639 | * https://github.com/akiradeveloper/dm-writeboost/issues/110 1640 | * - discarding backing data only violates DRAT 1641 | * - strictly discarding both cache blocks and backing data is nearly impossible 1642 | * considering cache hits may occur partially. 1643 | */ 1644 | ti->num_discard_bios = 0; 1645 | ti->discards_supported = false; 1646 | 1647 | ti->PER_BIO_DATA_SIZE = sizeof(struct per_bio_data); 1648 | 1649 | wb = kzalloc(sizeof(*wb), GFP_KERNEL); 1650 | if (!wb) { 1651 | DMERR("Failed to allocate wb"); 1652 | return -ENOMEM; 1653 | } 1654 | ti->private = wb; 1655 | wb->ti = ti; 1656 | 1657 | wb->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle); 1658 | if (IS_ERR(wb->copier)) { 1659 | err = PTR_ERR(wb->copier); 1660 | goto bad_kcopyd_client; 1661 | } 1662 | 1663 | wb->buf_8_cachep = kmem_cache_create("dmwb_buf_8", 1664 | 1 << 12, 1 << 12, SLAB_RED_ZONE, NULL); 1665 | if (!wb->buf_8_cachep) { 1666 | err = -ENOMEM; 1667 | goto bad_buf_8_cachep; 1668 | } 1669 | wb->buf_8_pool = mempool_create_slab_pool(16, wb->buf_8_cachep); 1670 | if (!wb->buf_8_pool) { 1671 | err = -ENOMEM; 1672 | goto bad_buf_8_pool; 1673 | } 1674 | 1675 | wb->io_wq = create_singlethread_workqueue("dmwb_io"); 1676 | if (!wb->io_wq) { 1677 | DMERR("Failed to allocate io_wq"); 1678 | err = -ENOMEM; 1679 | goto bad_io_wq; 1680 | } 1681 | 1682 | wb->io_client = dm_io_client_create(); 1683 | if (IS_ERR(wb->io_client)) { 1684 | DMERR("Failed to allocate io_client"); 1685 | err = PTR_ERR(wb->io_client); 1686 | goto bad_io_client; 1687 | } 1688 | 1689 | mutex_init(&wb->io_lock); 1690 | init_waitqueue_head(&wb->inflight_ios_wq); 1691 | spin_lock_init(&wb->mb_lock); 1692 | atomic64_set(&wb->nr_dirty_caches, 0); 1693 | clear_bit(WB_CREATED, &wb->flags); 1694 | 1695 | return err; 1696 | 1697 | bad_io_client: 1698 | destroy_workqueue(wb->io_wq); 1699 | bad_io_wq: 1700 | mempool_destroy(wb->buf_8_pool); 1701 | bad_buf_8_pool: 1702 | kmem_cache_destroy(wb->buf_8_cachep); 1703 | bad_buf_8_cachep: 1704 | dm_kcopyd_client_destroy(wb->copier); 1705 | bad_kcopyd_client: 1706 | kfree(wb); 1707 | return err; 1708 | } 1709 | 1710 | static void free_core_struct(struct wb_device *wb) 1711 | { 1712 | dm_io_client_destroy(wb->io_client); 1713 | destroy_workqueue(wb->io_wq); 1714 | mempool_destroy(wb->buf_8_pool); 1715 | kmem_cache_destroy(wb->buf_8_cachep); 1716 | dm_kcopyd_client_destroy(wb->copier); 1717 | kfree(wb); 1718 | } 1719 | 1720 | static int copy_ctr_args(struct wb_device *wb, int argc, const char **argv) 1721 | { 1722 | unsigned i; 1723 | const char **copy; 1724 | 1725 | copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL); 1726 | if (!copy) 1727 | 
return -ENOMEM; 1728 | for (i = 0; i < argc; i++) { 1729 | copy[i] = kstrdup(argv[i], GFP_KERNEL); 1730 | if (!copy[i]) { 1731 | while (i--) 1732 | kfree(copy[i]); 1733 | kfree(copy); 1734 | return -ENOMEM; 1735 | } 1736 | } 1737 | 1738 | wb->nr_ctr_args = argc; 1739 | wb->ctr_args = copy; 1740 | 1741 | return 0; 1742 | } 1743 | 1744 | static void free_ctr_args(struct wb_device *wb) 1745 | { 1746 | int i; 1747 | for (i = 0; i < wb->nr_ctr_args; i++) 1748 | kfree(wb->ctr_args[i]); 1749 | kfree(wb->ctr_args); 1750 | } 1751 | 1752 | #define save_arg(name) wb->name##_saved = wb->name 1753 | #define restore_arg(name) if (wb->name##_saved) { wb->name = wb->name##_saved; } 1754 | 1755 | /* 1756 | * Create a writeboost device 1757 | * 1758 | * 1759 | * <#optional args> 1760 | * optionals are unordered lists of k-v pair. 1761 | * 1762 | * See doc for detail. 1763 | */ 1764 | static int writeboost_ctr(struct dm_target *ti, unsigned int argc, char **argv) 1765 | { 1766 | int err = 0; 1767 | struct wb_device *wb; 1768 | 1769 | struct dm_arg_set as; 1770 | as.argc = argc; 1771 | as.argv = argv; 1772 | 1773 | err = init_core_struct(ti); 1774 | if (err) { 1775 | ti->error = "init_core_struct failed"; 1776 | return err; 1777 | } 1778 | wb = ti->private; 1779 | 1780 | err = copy_ctr_args(wb, argc - 2, (const char **)argv + 2); 1781 | if (err) { 1782 | ti->error = "copy_ctr_args failed"; 1783 | goto bad_ctr_args; 1784 | } 1785 | 1786 | err = consume_essential_argv(wb, &as); 1787 | if (err) { 1788 | ti->error = "consume_essential_argv failed"; 1789 | goto bad_essential_argv; 1790 | } 1791 | 1792 | err = consume_optional_argv(wb, &as); 1793 | if (err) { 1794 | ti->error = "consume_optional_argv failed"; 1795 | goto bad_optional_argv; 1796 | } 1797 | 1798 | save_arg(writeback_threshold); 1799 | save_arg(nr_max_batched_writeback); 1800 | save_arg(update_sb_record_interval); 1801 | save_arg(sync_data_interval); 1802 | save_arg(read_cache_threshold); 1803 | save_arg(nr_read_cache_cells); 1804 | 1805 | err = resume_cache(wb); 1806 | if (err) { 1807 | ti->error = "resume_cache failed"; 1808 | goto bad_resume_cache; 1809 | } 1810 | 1811 | wb->nr_read_cache_cells = 2048; /* 8MB */ 1812 | restore_arg(nr_read_cache_cells); 1813 | err = init_read_cache_cells(wb); 1814 | if (err) { 1815 | ti->error = "init_read_cache_cells failed"; 1816 | goto bad_read_cache_cells; 1817 | } 1818 | 1819 | clear_stat(wb); 1820 | 1821 | set_bit(WB_CREATED, &wb->flags); 1822 | 1823 | restore_arg(writeback_threshold); 1824 | restore_arg(nr_max_batched_writeback); 1825 | restore_arg(update_sb_record_interval); 1826 | restore_arg(sync_data_interval); 1827 | restore_arg(read_cache_threshold); 1828 | 1829 | return err; 1830 | 1831 | bad_read_cache_cells: 1832 | free_cache(wb); 1833 | bad_resume_cache: 1834 | dm_put_device(ti, wb->cache_dev); 1835 | dm_put_device(ti, wb->backing_dev); 1836 | bad_optional_argv: 1837 | bad_essential_argv: 1838 | free_ctr_args(wb); 1839 | bad_ctr_args: 1840 | free_core_struct(wb); 1841 | ti->private = NULL; 1842 | 1843 | return err; 1844 | } 1845 | 1846 | static void writeboost_dtr(struct dm_target *ti) 1847 | { 1848 | struct wb_device *wb = ti->private; 1849 | 1850 | free_read_cache_cells(wb); 1851 | 1852 | free_cache(wb); 1853 | 1854 | dm_put_device(ti, wb->cache_dev); 1855 | dm_put_device(ti, wb->backing_dev); 1856 | 1857 | free_ctr_args(wb); 1858 | 1859 | free_core_struct(wb); 1860 | ti->private = NULL; 1861 | } 1862 | 1863 | /*----------------------------------------------------------------------------*/ 1864 | 
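/*
 * Illustrative use of the runtime message interface handled by
 * writeboost_message() below (the device name "wbdev" and the value 70
 * are assumptions, not defaults):
 *
 *   dmsetup message wbdev 0 clear_stat
 *   dmsetup message wbdev 0 drop_caches
 *   dmsetup message wbdev 0 writeback_threshold 70
 */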

static void writeboost_dtr(struct dm_target *ti)
{
	struct wb_device *wb = ti->private;

	free_read_cache_cells(wb);

	free_cache(wb);

	dm_put_device(ti, wb->cache_dev);
	dm_put_device(ti, wb->backing_dev);

	free_ctr_args(wb);

	free_core_struct(wb);
	ti->private = NULL;
}

/*----------------------------------------------------------------------------*/

/*
 * .postsuspend is called before .dtr.
 * We flush out all the transient data and make them persistent.
 */
static void writeboost_postsuspend(struct dm_target *ti)
{
	struct wb_device *wb = ti->private;
	flush_current_buffer(wb);
	dm_blkdev_issue_flush(wb->cache_dev->bdev, GFP_NOIO);
}

#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,17,0)
static int writeboost_message(struct dm_target *ti, unsigned argc, char **argv,
			      char *result, unsigned maxlen)
#else
static int writeboost_message(struct dm_target *ti, unsigned argc, char **argv)
#endif
{
	struct wb_device *wb = ti->private;

	struct dm_arg_set as;
	as.argc = argc;
	as.argv = argv;

	if (!strcasecmp(argv[0], "clear_stat")) {
		clear_stat(wb);
		return 0;
	}

	if (!strcasecmp(argv[0], "drop_caches")) {
		int err = 0;
		wb->force_drop = true;
		err = wait_event_interruptible(wb->wait_drop_caches,
				!atomic64_read(&wb->nr_dirty_caches));
		wb->force_drop = false;
		return err;
	}

	return do_consume_optional_argv(wb, &as, 2);
}
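/*
 * Illustrative sketch (not part of the original source): messages accepted by
 * writeboost_message() above. Besides "clear_stat" and "drop_caches", a
 * single key-value pair is re-parsed through do_consume_optional_argv() with
 * two tokens. The device name "wbdev" is a hypothetical example:
 *
 *   dmsetup message wbdev 0 clear_stat
 *   dmsetup message wbdev 0 drop_caches
 *   dmsetup message wbdev 0 writeback_threshold 70
 */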

static int writeboost_iterate_devices(struct dm_target *ti,
				      iterate_devices_callout_fn fn, void *data)
{
	int r = 0;
	struct wb_device *wb = ti->private;

	r = fn(ti, wb->cache_dev, 0, dm_devsize(wb->cache_dev), data);
	if (!r)
		r = fn(ti, wb->backing_dev, 0, ti->len, data);

	return r;
}

static void writeboost_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
#if LINUX_VERSION_CODE >= KERNEL_VERSION(6,12,0)
	limits->io_opt = 4096;
#else
	blk_limits_io_opt(limits, 4096);
#endif
}

/*
 * STATUSTYPE_INFO emits: cursor, nr_caches, nr_segments, the current segment
 * id, the last flushed and last written-back segment ids, nr_dirty_caches,
 * the STATLEN statistics counters, count_non_full_flushed, and finally the
 * tunable key-value pairs preceded by their token count.
 */
static void writeboost_status(struct dm_target *ti, status_type_t type,
			      unsigned flags, char *result, unsigned maxlen)
{
	ssize_t sz = 0;
	char buf[BDEVNAME_SIZE];
	struct wb_device *wb = ti->private;
	size_t i;

	switch (type) {
	case STATUSTYPE_INFO:
		DMEMIT("%u %u %llu %llu %llu %llu %llu",
		       (unsigned int) wb->cursor,
		       (unsigned int) wb->nr_caches,
		       (long long unsigned int) wb->nr_segments,
		       (long long unsigned int) wb->current_seg->id,
		       (long long unsigned int) atomic64_read(&wb->last_flushed_segment_id),
		       (long long unsigned int) atomic64_read(&wb->last_writeback_segment_id),
		       (long long unsigned int) atomic64_read(&wb->nr_dirty_caches));

		for (i = 0; i < STATLEN; i++) {
			atomic64_t *v = &wb->stat[i];
			DMEMIT(" %llu", (unsigned long long) atomic64_read(v));
		}
		DMEMIT(" %llu", (unsigned long long) atomic64_read(&wb->count_non_full_flushed));

		DMEMIT(" %d", 10); /* number of tunable tokens that follow (5 key-value pairs) */
		DMEMIT(" writeback_threshold %d", wb->writeback_threshold);
		DMEMIT(" nr_cur_batched_writeback %u", wb->nr_cur_batched_writeback);
		DMEMIT(" sync_data_interval %lu", wb->sync_data_interval);
		DMEMIT(" update_sb_record_interval %lu", wb->update_sb_record_interval);
		DMEMIT(" read_cache_threshold %u", wb->read_cache_threshold);
		break;

	case STATUSTYPE_TABLE:
		format_dev_t(buf, wb->backing_dev->bdev->bd_dev);
		DMEMIT("%s", buf);
		format_dev_t(buf, wb->cache_dev->bdev->bd_dev);
		DMEMIT(" %s", buf);

		for (i = 0; i < wb->nr_ctr_args; i++)
			DMEMIT(" %s", wb->ctr_args[i]);
		break;

#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,15,0)
	case STATUSTYPE_IMA:
		*result = '\0';
		break;
#endif
	}
}

static struct target_type writeboost_target = {
	.name = "writeboost",
	.version = {2, 2, 19},
	.module = THIS_MODULE,
	.map = writeboost_map,
	.end_io = writeboost_end_io,
	.ctr = writeboost_ctr,
	.dtr = writeboost_dtr,
	.postsuspend = writeboost_postsuspend,
	.message = writeboost_message,
	.status = writeboost_status,
	.io_hints = writeboost_io_hints,
	.iterate_devices = writeboost_iterate_devices,
};

static int __init writeboost_module_init(void)
{
	int err = 0;

	err = dm_register_target(&writeboost_target);
	if (err < 0) {
		DMERR("Failed to register target");
		return err;
	}

	return err;
}

static void __exit writeboost_module_exit(void)
{
	dm_unregister_target(&writeboost_target);
}

module_init(writeboost_module_init);
module_exit(writeboost_module_exit);

MODULE_AUTHOR("Akira Hayakawa ");
MODULE_DESCRIPTION(DM_NAME " writeboost target");
MODULE_LICENSE("GPL");
--------------------------------------------------------------------------------