├── .gitignore ├── add-ext4-journal-lazy-mount-option ├── add-indirection-to-metadata-block-read-paths ├── add-journal-no-cleanup-option ├── add-support-for-log-metadata-block-tracking-in-log ├── archive ├── add-WARN_ON-with-unmapped-dirty-bh-in-writepage ├── auto-enable-journal_async_commit ├── avoid-unnecessarily-writing-back-dirty-pages-before-hole-punching ├── bio-debug ├── introduce-new-i_write_mutex └── jbd2-dont-write-non-commit-blocks-synchronously ├── cleaner ├── disable-writeback ├── jbd2-dont-double-bump-transaction-number ├── journal-superblock-changes ├── load-jmap-from-journal ├── old-patches ├── add-blkdiscard-ioctl ├── add-encryption-debug-files ├── add-fallocate-mode-blocking-for-debugging ├── add-squelch-errors-support ├── add-sysfs-bool-support ├── akpm-jbd2-locking-fix ├── block-dio-during-truncate ├── commit-as-soon-as-possible-after-log_start_commit ├── crypto-add-ciphertext_access-mount-option ├── crypto-add-ioctls-to-backup-crypto-metadata ├── crypto-rename-ext4_get_encryption_info ├── delalloc-debug ├── dont-use-io-end-if-not-needed ├── dump-in-use-buffers ├── include-mpage-functions-into-readpage.c ├── inline-ext4_get_block-into-readpage ├── move-read-page-functions-to-new-file ├── only-call-ext4_truncate-if-there-is-data-to-truncate ├── series └── use-discard-if-possible-in-blkdev_issue_zeroout ├── series ├── stable-boundary ├── stable-boundary-undo.patch └── timestamps /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | status 3 | 4 | -------------------------------------------------------------------------------- /add-ext4-journal-lazy-mount-option: -------------------------------------------------------------------------------- 1 | ext4: add journal_lazy mount option 2 | 3 | This option turns on the lazy journalling option, as described in the 4 | FAST 2017 paper, "Evolving Ext4 for Shingled Disks"[1]. 
5 | 6 | [1] https://www.usenix.org/conference/fast17/technical-sessions/presentation/aghayev 7 | 8 | Signed-off-by: Theodore Ts'o 9 | --- 10 | fs/ext4/ext4.h | 1 + 11 | fs/ext4/inode.c | 2 +- 12 | fs/ext4/ioctl.c | 42 ++++++++++++++++++++++++++++++++---------- 13 | fs/ext4/super.c | 56 ++++++++++++++++++++++++++++++++++++++++++++------------ 14 | 4 files changed, 78 insertions(+), 23 deletions(-) 15 | 16 | diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h 17 | index fc2bdaa71c44..e19b6bac2d91 100644 18 | --- a/fs/ext4/ext4.h 19 | +++ b/fs/ext4/ext4.h 20 | @@ -1079,6 +1079,7 @@ struct ext4_inode_info { 21 | * Mount flags set via mount options or defaults 22 | */ 23 | #define EXT4_MOUNT_NO_MBCACHE 0x00001 /* Do not use mbcache */ 24 | +#define EXT4_MOUNT_JOURNAL_LAZY 0x00002 /* Do lazy writeback of journalled metadata */ 25 | #define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ 26 | #define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ 27 | #define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ 28 | diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c 29 | index 3969d0278469..d43c326f4048 100644 30 | --- a/fs/ext4/inode.c 31 | +++ b/fs/ext4/inode.c 32 | @@ -3287,7 +3287,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) 33 | filemap_write_and_wait(mapping); 34 | } 35 | 36 | - if (EXT4_JOURNAL(inode) && 37 | + if (EXT4_JOURNAL(inode) && !test_opt(inode->i_sb, JOURNAL_LAZY) && 38 | ext4_test_inode_state(inode, EXT4_STATE_JDATA)) { 39 | /* 40 | * This is a REALLY heavyweight approach, but the use of 41 | diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c 42 | index a7074115d6f6..8556d6003d15 100644 43 | --- a/fs/ext4/ioctl.c 44 | +++ b/fs/ext4/ioctl.c 45 | @@ -239,6 +239,20 @@ static int ext4_ioctl_setflags(struct inode *inode, 46 | if (!capable(CAP_SYS_RESOURCE)) 47 | goto flags_out; 48 | } 49 | + 50 | + /* 51 | + * Clearing the JOURNAL_DATA flag is *hard* with lazy 52 | + * journalling. 
We can't use jbd2_journal_flush(); instead, 53 | + * we would have to make sure all blocks belonging to the file 54 | + * are evacuated from the journal and saved to their final 55 | + * location on disk. Punt for now. 56 | + */ 57 | + if ((oldflags & EXT4_JOURNAL_DATA_FL) && !jflag && 58 | + test_opt(inode->i_sb, JOURNAL_LAZY)) { 59 | + err = -EOPNOTSUPP; 60 | + goto flags_out; 61 | + } 62 | + 63 | if ((flags ^ oldflags) & EXT4_EXTENTS_FL) 64 | migrate = 1; 65 | 66 | @@ -626,6 +640,22 @@ static long ext4_ioctl_group_add(struct file *file, 67 | return err; 68 | } 69 | 70 | +/* 71 | + * If we are using journalling (excepting JBD2 lazy mode), make sure 72 | + * the block group descriptors are written out immediately 73 | + */ 74 | +static int flush_fs_group_descriptors(struct super_block *sb) 75 | +{ 76 | + int err = 0; 77 | + 78 | + if (EXT4_SB(sb)->s_journal && !test_opt(sb, JOURNAL_LAZY)) { 79 | + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); 80 | + err = jbd2_journal_flush(EXT4_SB(sb)->s_journal); 81 | + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); 82 | + } 83 | + return err; 84 | +} 85 | + 86 | long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 87 | { 88 | struct inode *inode = file_inode(filp); 89 | @@ -744,11 +774,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 90 | goto group_extend_out; 91 | 92 | err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count); 93 | - if (EXT4_SB(sb)->s_journal) { 94 | - jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); 95 | - err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); 96 | - jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); 97 | - } 98 | + err2 = flush_fs_group_descriptors(sb); 99 | if (err == 0) 100 | err = err2; 101 | mnt_drop_write_file(filp); 102 | @@ -886,11 +912,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 103 | goto resizefs_out; 104 | 105 | err = ext4_resize_fs(sb, n_blocks_count); 106 | - if 
(EXT4_SB(sb)->s_journal) { 107 | - jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); 108 | - err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); 109 | - jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); 110 | - } 111 | + err2 = flush_fs_group_descriptors(sb); 112 | if (err == 0) 113 | err = err2; 114 | mnt_drop_write_file(filp); 115 | diff --git a/fs/ext4/super.c b/fs/ext4/super.c 116 | index 9339717b85c8..406e4d4ffae0 100644 117 | --- a/fs/ext4/super.c 118 | +++ b/fs/ext4/super.c 119 | @@ -928,7 +928,8 @@ static void ext4_put_super(struct super_block *sb) 120 | ext4_mb_release(sb); 121 | ext4_ext_release(sb); 122 | 123 | - if (!sb_rdonly(sb) && !aborted && !test_opt(sb, JOURNAL_NOCLEANUP)) { 124 | + if (!sb_rdonly(sb) && !aborted && !test_opt(sb, JOURNAL_NOCLEANUP) && 125 | + !test_opt(sb, JOURNAL_LAZY)) { 126 | ext4_clear_feature_journal_needs_recovery(sb); 127 | es->s_state = cpu_to_le16(sbi->s_mount_state); 128 | } 129 | @@ -1384,6 +1385,7 @@ enum { 130 | Opt_inode_readahead_blks, Opt_journal_ioprio, 131 | Opt_dioread_nolock, Opt_dioread_lock, 132 | Opt_journal_nocleanup, Opt_journal_cleanup, 133 | + Opt_journal_nolazy, Opt_journal_lazy, 134 | Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, 135 | Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache, 136 | }; 137 | @@ -1474,6 +1476,8 @@ static const match_table_t tokens = { 138 | {Opt_nombcache, "no_mbcache"}, /* for backward compatibility */ 139 | {Opt_journal_nocleanup, "journal_nocleanup"}, 140 | {Opt_journal_cleanup, "journal_cleanup"}, 141 | + {Opt_journal_lazy, "journal_lazy"}, 142 | + {Opt_journal_nolazy, "journal_nolazy"}, 143 | {Opt_removed, "check=none"}, /* mount option from ext2/3 */ 144 | {Opt_removed, "nocheck"}, /* mount option from ext2/3 */ 145 | {Opt_removed, "reservation"}, /* mount option from ext2/3 */ 146 | @@ -1686,6 +1690,8 @@ static const struct mount_opts { 147 | {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET}, 148 | {Opt_journal_nocleanup, 
EXT4_MOUNT_JOURNAL_NOCLEANUP, MOPT_SET}, 149 | {Opt_journal_cleanup, EXT4_MOUNT_JOURNAL_NOCLEANUP, MOPT_CLEAR}, 150 | + {Opt_journal_lazy, EXT4_MOUNT_JOURNAL_LAZY, MOPT_SET}, 151 | + {Opt_journal_nolazy, EXT4_MOUNT_JOURNAL_LAZY, MOPT_CLEAR}, 152 | {Opt_err, 0, 0} 153 | }; 154 | 155 | @@ -4570,6 +4576,10 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) 156 | journal->j_flags |= JBD2_NO_CLEANUP; 157 | else 158 | journal->j_flags &= ~JBD2_NO_CLEANUP; 159 | + if (test_opt(sb, JOURNAL_LAZY)) 160 | + journal->j_flags |= JBD2_LAZY; 161 | + else 162 | + journal->j_flags &= ~JBD2_LAZY; 163 | write_unlock(&journal->j_state_lock); 164 | } 165 | 166 | @@ -4804,6 +4814,24 @@ static int ext4_load_journal(struct super_block *sb, 167 | 168 | EXT4_SB(sb)->s_journal = journal; 169 | ext4_clear_journal_err(sb, es); 170 | + 171 | + if (test_opt(sb, JOURNAL_LAZY)) { 172 | + struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; 173 | + 174 | + /* Read the latest version of the superblock from the journal */ 175 | + lock_buffer(sbh); 176 | + clear_buffer_uptodate(sbh); 177 | + err = jbd2_bh_submit_read(journal, sbh, __func__); 178 | + if (err) { 179 | + ext4_msg(sb, KERN_ERR, "error rereading superblock %d", 180 | + err); 181 | + set_buffer_uptodate(sbh); 182 | + } 183 | + if (!ext4_superblock_csum_verify(sb, es)) 184 | + ext4_msg(sb, KERN_ERR, 185 | + "superblock csum doesn't verify" 186 | + "after journal replay!"); 187 | + } 188 | return 0; 189 | } 190 | 191 | @@ -4894,6 +4922,9 @@ static void ext4_mark_recovery_complete(struct super_block *sb, 192 | { 193 | journal_t *journal = EXT4_SB(sb)->s_journal; 194 | 195 | + if (test_opt(sb, JOURNAL_LAZY)) 196 | + return; 197 | + 198 | if (!ext4_has_feature_journal(sb)) { 199 | BUG_ON(journal != NULL); 200 | return; 201 | @@ -5029,21 +5060,20 @@ static int ext4_freeze(struct super_block *sb) 202 | journal = EXT4_SB(sb)->s_journal; 203 | 204 | if (journal) { 205 | - /* Now we set up the journal barrier. 
*/ 206 | - jbd2_journal_lock_updates(journal); 207 | - 208 | /* 209 | - * Don't clear the needs_recovery flag if we failed to 210 | - * flush the journal. 211 | + * Set the journal barrier, then flush the journal and 212 | + * clear the needs_recovery flag if we are not in 213 | + * JBD2_LAZY mode. 214 | */ 215 | - error = jbd2_journal_flush(journal); 216 | - if (error < 0) 217 | - goto out; 218 | + jbd2_journal_lock_updates(journal); 219 | 220 | - /* Journal blocked and flushed, clear needs_recovery flag. */ 221 | + if (!test_opt(sb, JOURNAL_LAZY)) { 222 | + error = jbd2_journal_flush(journal); 223 | + if (error < 0) 224 | + goto out; 225 | + } 226 | ext4_clear_feature_journal_needs_recovery(sb); 227 | } 228 | - 229 | error = ext4_commit_super(sb, 1); 230 | out: 231 | if (journal) 232 | @@ -5061,7 +5091,7 @@ static int ext4_unfreeze(struct super_block *sb) 233 | if (sb_rdonly(sb) || ext4_forced_shutdown(EXT4_SB(sb))) 234 | return 0; 235 | 236 | - if (EXT4_SB(sb)->s_journal) { 237 | + if (EXT4_SB(sb)->s_journal && !test_opt(sb, JOURNAL_LAZY)) { 238 | /* Reset the needs_recovery flag before the fs is unlocked. */ 239 | ext4_set_feature_journal_needs_recovery(sb); 240 | } 241 | @@ -5595,6 +5625,8 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, 242 | * We don't need to lock updates but journal_flush() could 243 | * otherwise be livelocked... 
244 | */ 245 | + if (test_opt(sb, JOURNAL_LAZY)) 246 | + return -EOPNOTSUPP; 247 | jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); 248 | err = jbd2_journal_flush(EXT4_SB(sb)->s_journal); 249 | jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); 250 | -------------------------------------------------------------------------------- /add-indirection-to-metadata-block-read-paths: -------------------------------------------------------------------------------- 1 | Add indirection to metadata read paths 2 | 3 | From: Abutalib Aghayev 4 | 5 | Change all metadata block reads to use jmap-aware function that first looks 6 | up the metadata block in the jmap. If lookup is successful, the function 7 | reads the corresponding log block from the journal and copies it to the 8 | metadata block buffer head. Otherwise, it reads the metadata block from 9 | the file system, just like standard jmap-unaware function. 10 | 11 | Signed-off-by: Abutalib Aghayev 12 | Signed-off-by: Theodore Ts'o 13 | --- 14 | fs/ext4/extents.c | 3 ++- 15 | fs/ext4/ialloc.c | 5 ++++- 16 | fs/ext4/indirect.c | 3 ++- 17 | fs/ext4/inode.c | 20 ++++++++++++++------ 18 | fs/ext4/move_extent.c | 3 ++- 19 | fs/ext4/resize.c | 4 +++- 20 | 6 files changed, 27 insertions(+), 11 deletions(-) 21 | 22 | diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c 23 | index 97f0fd06728d..47914c6a2556 100644 24 | --- a/fs/ext4/extents.c 25 | +++ b/fs/ext4/extents.c 26 | @@ -517,6 +517,7 @@ __read_extent_tree_block(const char *function, unsigned int line, 27 | { 28 | struct buffer_head *bh; 29 | int err; 30 | + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 31 | 32 | bh = sb_getblk_gfp(inode->i_sb, pblk, __GFP_MOVABLE | GFP_NOFS); 33 | if (unlikely(!bh)) 34 | @@ -524,7 +525,7 @@ __read_extent_tree_block(const char *function, unsigned int line, 35 | 36 | if (!bh_uptodate_or_lock(bh)) { 37 | trace_ext4_ext_load_extent(inode, pblk, _RET_IP_); 38 | - err = bh_submit_read(bh); 39 | + err = jbd2_bh_submit_read(journal, bh, 
__func__); 40 | if (err < 0) 41 | goto errout; 42 | } 43 | diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c 44 | index 507bfb3344d4..1c3447629e76 100644 45 | --- a/fs/ext4/ialloc.c 46 | +++ b/fs/ext4/ialloc.c 47 | @@ -14,6 +14,7 @@ 48 | 49 | #include 50 | #include 51 | +#include 52 | #include 53 | #include 54 | #include 55 | @@ -162,6 +163,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) 56 | struct buffer_head *bh = NULL; 57 | ext4_fsblk_t bitmap_blk; 58 | int err; 59 | + journal_t *journal = EXT4_SB(sb)->s_journal; 60 | 61 | desc = ext4_get_group_desc(sb, block_group, NULL); 62 | if (!desc) 63 | @@ -216,7 +218,8 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) 64 | trace_ext4_load_inode_bitmap(sb, block_group); 65 | bh->b_end_io = ext4_end_bitmap_read; 66 | get_bh(bh); 67 | - submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh); 68 | + jbd2_submit_bh(journal, REQ_OP_READ, REQ_META | REQ_PRIO, bh, __func__); 69 | + 70 | wait_on_buffer(bh); 71 | if (!buffer_uptodate(bh)) { 72 | put_bh(bh); 73 | diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c 74 | index 7ffa290cbb8e..06a79f5e563e 100644 75 | --- a/fs/ext4/indirect.c 76 | +++ b/fs/ext4/indirect.c 77 | @@ -145,6 +145,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth, 78 | Indirect chain[4], int *err) 79 | { 80 | struct super_block *sb = inode->i_sb; 81 | + journal_t *journal = EXT4_SB(sb)->s_journal; 82 | Indirect *p = chain; 83 | struct buffer_head *bh; 84 | int ret = -EIO; 85 | @@ -162,7 +163,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth, 86 | } 87 | 88 | if (!bh_uptodate_or_lock(bh)) { 89 | - if (bh_submit_read(bh) < 0) { 90 | + if (jbd2_bh_submit_read(journal, bh, __func__) < 0) { 91 | put_bh(bh); 92 | goto failure; 93 | } 94 | diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c 95 | index c774bdc22759..a56e717b39be 100644 96 | --- a/fs/ext4/inode.c 97 | +++ b/fs/ext4/inode.c 98 | @@ -1001,13 +1001,15 @@ struct buffer_head 
*ext4_bread(handle_t *handle, struct inode *inode, 99 | ext4_lblk_t block, int map_flags) 100 | { 101 | struct buffer_head *bh; 102 | + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 103 | 104 | bh = ext4_getblk(handle, inode, block, map_flags); 105 | if (IS_ERR(bh)) 106 | return bh; 107 | if (!bh || buffer_uptodate(bh)) 108 | return bh; 109 | - ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &bh); 110 | + jbd2_ll_rw_block(journal, REQ_OP_READ, REQ_META | REQ_PRIO, 1, &bh, 111 | + __func__); 112 | wait_on_buffer(bh); 113 | if (buffer_uptodate(bh)) 114 | return bh; 115 | @@ -1020,6 +1022,7 @@ int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count, 116 | bool wait, struct buffer_head **bhs) 117 | { 118 | int i, err; 119 | + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 120 | 121 | for (i = 0; i < bh_count; i++) { 122 | bhs[i] = ext4_getblk(NULL, inode, block + i, 0 /* map_flags */); 123 | @@ -1033,8 +1036,9 @@ int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count, 124 | for (i = 0; i < bh_count; i++) 125 | /* Note that NULL bhs[i] is valid because of holes. 
*/ 126 | if (bhs[i] && !buffer_uptodate(bhs[i])) 127 | - ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, 128 | - &bhs[i]); 129 | + jbd2_ll_rw_block(journal, REQ_OP_READ, 130 | + REQ_META | REQ_PRIO, 1, &bhs[i], 131 | + __func__); 132 | 133 | if (!wait) 134 | return 0; 135 | @@ -4448,6 +4452,7 @@ static int __ext4_get_inode_loc(struct inode *inode, 136 | struct super_block *sb = inode->i_sb; 137 | ext4_fsblk_t block; 138 | int inodes_per_block, inode_offset; 139 | + journal_t *journal = EXT4_SB(sb)->s_journal; 140 | 141 | iloc->bh = NULL; 142 | if (!ext4_valid_inum(sb, inode->i_ino)) 143 | @@ -4551,8 +4556,10 @@ static int __ext4_get_inode_loc(struct inode *inode, 144 | table += num / inodes_per_block; 145 | if (end > table) 146 | end = table; 147 | - while (b <= end) 148 | - sb_breadahead(sb, b++); 149 | + if (journal) { 150 | + while (b <= end) 151 | + jbd2_sb_breadahead(journal, sb, b++); 152 | + } 153 | } 154 | 155 | /* 156 | @@ -4563,7 +4570,8 @@ static int __ext4_get_inode_loc(struct inode *inode, 157 | trace_ext4_load_inode(inode); 158 | get_bh(bh); 159 | bh->b_end_io = end_buffer_read_sync; 160 | - submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh); 161 | + jbd2_submit_bh(journal, REQ_OP_READ, REQ_META | REQ_PRIO, bh, 162 | + __func__); 163 | wait_on_buffer(bh); 164 | if (!buffer_uptodate(bh)) { 165 | EXT4_ERROR_INODE_BLOCK(inode, block, 166 | diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c 167 | index 9bb36909ec92..0f6c00d0df17 100644 168 | --- a/fs/ext4/move_extent.c 169 | +++ b/fs/ext4/move_extent.c 170 | @@ -177,6 +177,7 @@ static int 171 | mext_page_mkuptodate(struct page *page, unsigned from, unsigned to) 172 | { 173 | struct inode *inode = page->mapping->host; 174 | + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 175 | sector_t block; 176 | struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; 177 | unsigned int blocksize, block_start, block_end; 178 | @@ -225,7 +226,7 @@ mext_page_mkuptodate(struct page *page, unsigned from, 
unsigned to) 179 | for (i = 0; i < nr; i++) { 180 | bh = arr[i]; 181 | if (!bh_uptodate_or_lock(bh)) { 182 | - err = bh_submit_read(bh); 183 | + err = jbd2_bh_submit_read(journal, bh, __func__); 184 | if (err) 185 | return err; 186 | } 187 | diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c 188 | index 035cd3f4785e..5c817953053b 100644 189 | --- a/fs/ext4/resize.c 190 | +++ b/fs/ext4/resize.c 191 | @@ -1193,10 +1193,12 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb, 192 | static struct buffer_head *ext4_get_bitmap(struct super_block *sb, __u64 block) 193 | { 194 | struct buffer_head *bh = sb_getblk(sb, block); 195 | + journal_t *journal = EXT4_SB(sb)->s_journal; 196 | + 197 | if (unlikely(!bh)) 198 | return NULL; 199 | if (!bh_uptodate_or_lock(bh)) { 200 | - if (bh_submit_read(bh) < 0) { 201 | + if (jbd2_bh_submit_read(journal, bh, __func__) < 0) { 202 | brelse(bh); 203 | return NULL; 204 | } 205 | -------------------------------------------------------------------------------- /add-journal-no-cleanup-option: -------------------------------------------------------------------------------- 1 | ext4, jbd2: add the journal_nocleanup mount option 2 | 3 | This debugging option is useful for generating test cases for the 4 | journal replay code. 
5 | 6 | Signed-off-by: Theodore Ts'o 7 | --- 8 | fs/ext4/ext4.h | 1 + 9 | fs/ext4/super.c | 11 ++++++++++- 10 | fs/jbd2/journal.c | 12 +++++++++--- 11 | include/linux/jbd2.h | 1 + 12 | 4 files changed, 21 insertions(+), 4 deletions(-) 13 | 14 | diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h 15 | index 7c7123f265c2..fc2bdaa71c44 100644 16 | --- a/fs/ext4/ext4.h 17 | +++ b/fs/ext4/ext4.h 18 | @@ -1115,6 +1115,7 @@ struct ext4_inode_info { 19 | #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ 20 | #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ 21 | #define EXT4_MOUNT_WARN_ON_ERROR 0x2000000 /* Trigger WARN_ON on error */ 22 | +#define EXT4_MOUNT_JOURNAL_NOCLEANUP 0x4000000 /* Preserve the journal on unmount */ 23 | #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ 24 | #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ 25 | #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ 26 | diff --git a/fs/ext4/super.c b/fs/ext4/super.c 27 | index ae86983cbf60..9339717b85c8 100644 28 | --- a/fs/ext4/super.c 29 | +++ b/fs/ext4/super.c 30 | @@ -928,7 +928,7 @@ static void ext4_put_super(struct super_block *sb) 31 | ext4_mb_release(sb); 32 | ext4_ext_release(sb); 33 | 34 | - if (!sb_rdonly(sb) && !aborted) { 35 | + if (!sb_rdonly(sb) && !aborted && !test_opt(sb, JOURNAL_NOCLEANUP)) { 36 | ext4_clear_feature_journal_needs_recovery(sb); 37 | es->s_state = cpu_to_le16(sbi->s_mount_state); 38 | } 39 | @@ -1383,6 +1383,7 @@ enum { 40 | Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, 41 | Opt_inode_readahead_blks, Opt_journal_ioprio, 42 | Opt_dioread_nolock, Opt_dioread_lock, 43 | + Opt_journal_nocleanup, Opt_journal_cleanup, 44 | Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, 45 | Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache, 46 | }; 47 | @@ -1471,6 +1472,8 @@ static const match_table_t tokens = { 48 | {Opt_test_dummy_encryption, 
"test_dummy_encryption"}, 49 | {Opt_nombcache, "nombcache"}, 50 | {Opt_nombcache, "no_mbcache"}, /* for backward compatibility */ 51 | + {Opt_journal_nocleanup, "journal_nocleanup"}, 52 | + {Opt_journal_cleanup, "journal_cleanup"}, 53 | {Opt_removed, "check=none"}, /* mount option from ext2/3 */ 54 | {Opt_removed, "nocheck"}, /* mount option from ext2/3 */ 55 | {Opt_removed, "reservation"}, /* mount option from ext2/3 */ 56 | @@ -1681,6 +1684,8 @@ static const struct mount_opts { 57 | {Opt_max_dir_size_kb, 0, MOPT_GTE0}, 58 | {Opt_test_dummy_encryption, 0, MOPT_GTE0}, 59 | {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET}, 60 | + {Opt_journal_nocleanup, EXT4_MOUNT_JOURNAL_NOCLEANUP, MOPT_SET}, 61 | + {Opt_journal_cleanup, EXT4_MOUNT_JOURNAL_NOCLEANUP, MOPT_CLEAR}, 62 | {Opt_err, 0, 0} 63 | }; 64 | 65 | @@ -4561,6 +4566,10 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) 66 | journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR; 67 | else 68 | journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR; 69 | + if (test_opt(sb, JOURNAL_NOCLEANUP)) 70 | + journal->j_flags |= JBD2_NO_CLEANUP; 71 | + else 72 | + journal->j_flags &= ~JBD2_NO_CLEANUP; 73 | write_unlock(&journal->j_state_lock); 74 | } 75 | 76 | diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c 77 | index 8ef6b6daaa7a..860ab3c802a4 100644 78 | --- a/fs/jbd2/journal.c 79 | +++ b/fs/jbd2/journal.c 80 | @@ -1727,6 +1727,11 @@ int jbd2_journal_destroy(journal_t *journal) 81 | if (journal->j_running_transaction) 82 | jbd2_journal_commit_transaction(journal); 83 | 84 | + if (journal->j_flags & JBD2_NO_CLEANUP) { 85 | + jbd2_journal_destroy_checkpoint(journal); 86 | + journal->j_checkpoint_transactions = NULL; 87 | + } 88 | + 89 | /* Force any old transactions to disk */ 90 | 91 | /* Totally anal locking here... 
*/ 92 | @@ -1754,7 +1759,9 @@ int jbd2_journal_destroy(journal_t *journal) 93 | spin_unlock(&journal->j_list_lock); 94 | 95 | if (journal->j_sb_buffer) { 96 | - if (!is_journal_aborted(journal)) { 97 | + if (is_journal_aborted(journal)) 98 | + err = -EIO; 99 | + else if ((journal->j_flags & JBD2_NO_CLEANUP) == 0) { 100 | mutex_lock_io(&journal->j_checkpoint_mutex); 101 | 102 | write_lock(&journal->j_state_lock); 103 | @@ -1765,8 +1772,7 @@ int jbd2_journal_destroy(journal_t *journal) 104 | jbd2_mark_journal_empty(journal, 105 | REQ_SYNC | REQ_PREFLUSH | REQ_FUA); 106 | mutex_unlock(&journal->j_checkpoint_mutex); 107 | - } else 108 | - err = -EIO; 109 | + } 110 | brelse(journal->j_sb_buffer); 111 | } 112 | 113 | diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h 114 | index b708e5169d1d..81fa9fa7ce9c 100644 115 | --- a/include/linux/jbd2.h 116 | +++ b/include/linux/jbd2.h 117 | @@ -1235,6 +1235,7 @@ JBD2_FEATURE_INCOMPAT_FUNCS(csum3, CSUM_V3) 118 | * data write error in ordered 119 | * mode */ 120 | #define JBD2_REC_ERR 0x080 /* The errno in the sb has been recorded */ 121 | +#define JBD2_NO_CLEANUP 0x100 /* Don't flush empty the journal on shutdown */ 122 | 123 | /* 124 | * Function declarations for the journaling transaction and buffer 125 | -------------------------------------------------------------------------------- /archive/add-WARN_ON-with-unmapped-dirty-bh-in-writepage: -------------------------------------------------------------------------------- 1 | ext4: add WARN_ON on unmapped dirty buffer_heads in writepage 2 | 3 | From: "Aneesh Kumar K.V" 4 | 5 | Now that block_lock_hole_extend() clears the dirty flag of 6 | buffer_heads outside i_size we should not find buffer_heads which are 7 | unmapped and dirty in writepage. If we find do a WARN_ON. We can 8 | still continue because block_write_full page look at the mapped flag 9 | only. 10 | 11 | Following sequence of events would result in the above condition. 
12 | 1) truncate(f, 1024) 13 | 2) mmap(f, 0, 4096) 14 | 3) a[0] = 'a' 15 | 4) truncate(f, 4096) 16 | 5) writepage(...) 17 | 18 | After step 3 we would have unmapped buffer_heads outside i_size. 19 | After step 4 we would have unmapped buffer_heads within i_size. 20 | 21 | Now that truncate is calling block_lock_hole_extend which in turn 22 | is clearing the dirty flag, we can safely assume that we won't 23 | find unmapped dirty buffer_heads in write page. If we did find one 24 | we should find out why. 25 | 26 | Signed-off-by: Aneesh Kumar K.V 27 | Acked-by: Jan Kara 28 | Signed-off-by: "Theodore Ts'o" 29 | --- 30 | fs/ext4/inode.c | 12 ++++++++++++ 31 | 1 files changed, 12 insertions(+), 0 deletions(-) 32 | 33 | diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c 34 | index 2219daa..9bba474 100644 35 | --- a/fs/ext4/inode.c 36 | +++ b/fs/ext4/inode.c 37 | @@ -2488,6 +2488,10 @@ static int __ext4_journalled_writepage(struct page *page, 38 | return ret; 39 | } 40 | 41 | +static int ext4_bh_unmapped_and_dirty(handle_t *handle, struct buffer_head *bh) 42 | +{ 43 | + return !buffer_mapped(bh) && buffer_dirty(bh); 44 | +} 45 | 46 | /* 47 | * Note that we don't need to start a transaction unless we're journaling data 48 | @@ -2602,6 +2606,14 @@ static int ext4_writepage(struct page *page, 49 | /* now mark the buffer_heads as dirty and uptodate */ 50 | block_commit_write(page, 0, len); 51 | } 52 | + /* 53 | + * There should not be any unmapped and dirty 54 | + * buffer_heads at this point. Look at block_lock_hole_extend 55 | + * for more info. 
If we find one print more info 56 | + */ 57 | + WARN(walk_page_buffers(NULL, page_bufs, 0, len, NULL, 58 | + ext4_bh_unmapped_and_dirty), 59 | + "Unmapped dirty buffer_heads found in %s\n", __func__); 60 | 61 | if (PageChecked(page) && ext4_should_journal_data(inode)) { 62 | /* 63 | -- 64 | 1.6.3.1.244.gf9275 65 | 66 | -- 67 | To unsubscribe from this list: send the line "unsubscribe linux-ext4" in 68 | the body of a message to majordomo@vger.kernel.org 69 | More majordomo info at http://vger.kernel.org/majordomo-info.html 70 | 71 | -------------------------------------------------------------------------------- /archive/auto-enable-journal_async_commit: -------------------------------------------------------------------------------- 1 | ext4: automatically enable journal_async_commit on ext4 file systems 2 | 3 | Now that we have cleaned up journal_async_commit, it's safe to enable 4 | it all the time. But we only want to do so if ext4-specific INCOMPAT 5 | features are enabled, since otherwise we will prevent the filesystem 6 | from being mounted using ext3. 7 | 8 | Signed-off-by: "Theodore Ts'o" 9 | 10 | --- 11 | Documentation/filesystems/ext4.txt | 11 ++++++++--- 12 | fs/ext4/super.c | 29 ++++++++++++++++++++++++++--- 13 | 2 files changed, 34 insertions(+), 6 deletions(-) 14 | 15 | diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt 16 | index 6ab9442..713f5d5 100644 17 | --- a/Documentation/filesystems/ext4.txt 18 | +++ b/Documentation/filesystems/ext4.txt 19 | @@ -140,9 +140,14 @@ journal_checksum Enable checksumming of the journal transactions. 20 | compatible change and will be ignored by older kernels. 21 | 22 | journal_async_commit Commit block can be written to disk without waiting 23 | - for descriptor blocks. If enabled older kernels cannot 24 | - mount the device. This will enable 'journal_checksum' 25 | - internally. 26 | + for descriptor blocks. This will enable 27 | + 'journal_checksum' internally. 
This mount 28 | + option will be automatically enabled if 29 | + ext4-specific INCOMPAT features are present in 30 | + the file system. 31 | + 32 | +nojournal_async_commit Disable the journal_async_commit option, even 33 | + for ext4 filesystems. 34 | 35 | journal=update Update the ext4 file system's journal to the current 36 | format. 37 | diff --git a/fs/ext4/super.c b/fs/ext4/super.c 38 | index 9706981..d719551 100644 39 | --- a/fs/ext4/super.c 40 | +++ b/fs/ext4/super.c 41 | @@ -203,6 +203,15 @@ void ext4_itable_unused_set(struct super_block *sb, 42 | bg->bg_itable_unused_hi = cpu_to_le16(count >> 16); 43 | } 44 | 45 | +/* 46 | + * If ext4 filesystem features are enabled, then enable async_commits 47 | + * by default. 48 | + */ 49 | +#define ASYNC_COMMIT_DEFAULT(sb) (EXT4_HAS_INCOMPAT_FEATURE(sb, \ 50 | + (EXT4_FEATURE_INCOMPAT_EXTENTS| \ 51 | + EXT4_FEATURE_INCOMPAT_64BIT| \ 52 | + EXT4_FEATURE_INCOMPAT_FLEX_BG))) 53 | + 54 | 55 | /* Just increment the non-pointer handle value */ 56 | static handle_t *ext4_get_nojournal(void) 57 | @@ -1020,9 +1029,15 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) 58 | */ 59 | seq_puts(seq, ",barrier="); 60 | seq_puts(seq, test_opt(sb, BARRIER) ? 
"1" : "0"); 61 | - if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) 62 | - seq_puts(seq, ",journal_async_commit"); 63 | - else if (test_opt(sb, JOURNAL_CHECKSUM)) 64 | + if (ASYNC_COMMIT_DEFAULT(sb)) { 65 | + if (!test_opt(sb, JOURNAL_ASYNC_COMMIT)) 66 | + seq_puts(seq, ",nojournal_async_commit"); 67 | + } else { 68 | + if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) 69 | + seq_puts(seq, ",journal_async_commit"); 70 | + } 71 | + if (test_opt(sb, JOURNAL_CHECKSUM) && 72 | + !test_opt(sb, JOURNAL_ASYNC_COMMIT)) 73 | seq_puts(seq, ",journal_checksum"); 74 | if (test_opt(sb, I_VERSION)) 75 | seq_puts(seq, ",i_version"); 76 | @@ -1239,6 +1254,7 @@ enum { 77 | Opt_commit, Opt_min_batch_time, Opt_max_batch_time, 78 | Opt_journal_update, Opt_journal_dev, 79 | Opt_journal_checksum, Opt_journal_async_commit, 80 | + Opt_nojournal_async_commit, 81 | Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, 82 | Opt_data_err_abort, Opt_data_err_ignore, 83 | Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, 84 | @@ -1285,6 +1301,7 @@ static const match_table_t tokens = { 85 | {Opt_journal_dev, "journal_dev=%u"}, 86 | {Opt_journal_checksum, "journal_checksum"}, 87 | {Opt_journal_async_commit, "journal_async_commit"}, 88 | + {Opt_nojournal_async_commit, "nojournal_async_commit"}, 89 | {Opt_abort, "abort"}, 90 | {Opt_data_journal, "data=journal"}, 91 | {Opt_data_ordered, "data=ordered"}, 92 | @@ -1559,6 +1576,9 @@ static int parse_options(char *options, struct super_block *sb, 93 | set_opt(sb, JOURNAL_ASYNC_COMMIT); 94 | set_opt(sb, JOURNAL_CHECKSUM); 95 | break; 96 | + case Opt_nojournal_async_commit: 97 | + clear_opt(sb, JOURNAL_ASYNC_COMMIT); 98 | + break; 99 | case Opt_noload: 100 | set_opt(sb, NOLOAD); 101 | break; 102 | @@ -3161,6 +3181,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) 103 | ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) 104 | set_opt(sb, DELALLOC); 105 | 106 | + if (ASYNC_COMMIT_DEFAULT(sb)) 107 | + set_opt(sb, 
JOURNAL_ASYNC_COMMIT); 108 | + 109 | if (!parse_options((char *) sbi->s_es->s_mount_opts, sb, 110 | &journal_devnum, &journal_ioprio, NULL, 0)) { 111 | ext4_msg(sb, KERN_WARNING, 112 | -------------------------------------------------------------------------------- /archive/avoid-unnecessarily-writing-back-dirty-pages-before-hole-punching: -------------------------------------------------------------------------------- 1 | ext4: avoid unnecessarily writing back dirty pages before hole punching 2 | 3 | From: Li Wang 4 | 5 | For hole punching, currently ext4 will synchronously write back the 6 | dirty pages that fit into the hole; since the data on the disk corresponding 7 | to those pages is to be deleted, it is beneficial to directly release 8 | those pages, no matter whether they are dirty or not, except in the ordered case. 9 | 10 | [ Fixed error return to unlock i_mutex if ext4_begin_ordered_punch_hole() 11 | fails. Thanks to Wei Yongjun for 12 | pointing this out.] 13 | 14 | Signed-off-by: Li Wang 15 | Signed-off-by: Yunchuan Wen 16 | Signed-off-by: "Theodore Ts'o" 17 | Cc: Dmitry Monakhov 18 | Reviewed-by: Zheng Liu 19 | Reviewed-by: Jan Kara 20 | --- 21 | fs/ext4/inode.c | 28 ++++++++++++++++------------ 22 | fs/jbd2/journal.c | 2 +- 23 | fs/jbd2/transaction.c | 29 ++++++----------------------- 24 | include/linux/jbd2.h | 33 +++++++++++++++++++++++++++++++-- 25 | 4 files changed, 54 insertions(+), 38 deletions(-) 26 | 27 | diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c 28 | index 0db830d..06136b5 100644 29 | --- a/fs/ext4/inode.c 30 | +++ b/fs/ext4/inode.c 31 | @@ -3466,6 +3466,16 @@ int ext4_can_truncate(struct inode *inode) 32 | return 0; 33 | } 34 | 35 | +static inline int ext4_begin_ordered_punch_hole(struct inode *inode, 36 | + loff_t start, loff_t length) 37 | +{ 38 | + if (!EXT4_I(inode)->jinode) 39 | + return 0; 40 | + return jbd2_journal_begin_ordered_punch_hole(EXT4_JOURNAL(inode), 41 | + EXT4_I(inode)->jinode, 42 | + start, start+length-1); 43 | +} 44 | 45 | /* 46
| * ext4_punch_hole: punches a hole in a file by releaseing the blocks 47 | * associated with the given offset and length 48 | @@ -3482,7 +3492,6 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) 49 | struct inode *inode = file_inode(file); 50 | struct super_block *sb = inode->i_sb; 51 | ext4_lblk_t first_block, stop_block; 52 | - struct address_space *mapping = inode->i_mapping; 53 | loff_t first_block_offset, last_block_offset; 54 | handle_t *handle; 55 | unsigned int credits; 56 | @@ -3498,17 +3507,6 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) 57 | 58 | trace_ext4_punch_hole(inode, offset, length); 59 | 60 | - /* 61 | - * Write out all dirty pages to avoid race conditions 62 | - * Then release them. 63 | - */ 64 | - if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { 65 | - ret = filemap_write_and_wait_range(mapping, offset, 66 | - offset + length - 1); 67 | - if (ret) 68 | - return ret; 69 | - } 70 | - 71 | mutex_lock(&inode->i_mutex); 72 | /* It's not possible punch hole on append only file */ 73 | if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { 74 | @@ -3537,6 +3535,12 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) 75 | first_block_offset = round_up(offset, sb->s_blocksize); 76 | last_block_offset = round_down((offset + length), sb->s_blocksize) - 1; 77 | 78 | + if (ext4_should_order_data(inode)) { 79 | + ret = ext4_begin_ordered_punch_hole(inode, offset, length); 80 | + if (ret) 81 | + goto out_mutex; 82 | + } 83 | + 84 | /* Now release the pages and zero block aligned part of pages*/ 85 | if (last_block_offset > first_block_offset) 86 | truncate_pagecache_range(inode, first_block_offset, 87 | diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c 88 | index 915dd57..4c8b8d4 100644 89 | --- a/fs/jbd2/journal.c 90 | +++ b/fs/jbd2/journal.c 91 | @@ -97,7 +97,7 @@ EXPORT_SYMBOL(jbd2_journal_force_commit); 92 | EXPORT_SYMBOL(jbd2_journal_file_inode); 93 | 
EXPORT_SYMBOL(jbd2_journal_init_jbd_inode); 94 | EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); 95 | -EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); 96 | +EXPORT_SYMBOL(jbd2_journal_begin_ordered_punch_hole); 97 | EXPORT_SYMBOL(jbd2_inode_cache); 98 | 99 | static void __journal_abort_soft (journal_t *journal, int errno); 100 | diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c 101 | index dd422e6..91d62e1 100644 102 | --- a/fs/jbd2/transaction.c 103 | +++ b/fs/jbd2/transaction.c 104 | @@ -2419,29 +2419,10 @@ done: 105 | return 0; 106 | } 107 | 108 | -/* 109 | - * File truncate and transaction commit interact with each other in a 110 | - * non-trivial way. If a transaction writing data block A is 111 | - * committing, we cannot discard the data by truncate until we have 112 | - * written them. Otherwise if we crashed after the transaction with 113 | - * write has committed but before the transaction with truncate has 114 | - * committed, we could see stale data in block A. This function is a 115 | - * helper to solve this problem. It starts writeout of the truncated 116 | - * part in case it is in the committing transaction. 117 | - * 118 | - * Filesystem code must call this function when inode is journaled in 119 | - * ordered mode before truncation happens and after the inode has been 120 | - * placed on orphan list with the new inode size. The second condition 121 | - * avoids the race that someone writes new data and we start 122 | - * committing the transaction after this function has been called but 123 | - * before a transaction for truncate is started (and furthermore it 124 | - * allows us to optimize the case where the addition to orphan list 125 | - * happens in the same transaction as write --- we don't have to write 126 | - * any data in such case). 
127 | - */ 128 | -int jbd2_journal_begin_ordered_truncate(journal_t *journal, 129 | + 130 | +int jbd2_journal_begin_ordered_punch_hole(journal_t *journal, 131 | struct jbd2_inode *jinode, 132 | - loff_t new_size) 133 | + loff_t start, loff_t end) 134 | { 135 | transaction_t *inode_trans, *commit_trans; 136 | int ret = 0; 137 | @@ -2460,10 +2441,12 @@ int jbd2_journal_begin_ordered_truncate(journal_t *journal, 138 | spin_unlock(&journal->j_list_lock); 139 | if (inode_trans == commit_trans) { 140 | ret = filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping, 141 | - new_size, LLONG_MAX); 142 | + start, end); 143 | if (ret) 144 | jbd2_journal_abort(journal, ret); 145 | } 146 | out: 147 | return ret; 148 | } 149 | + 150 | + 151 | diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h 152 | index 0302f3f..5f3c094 100644 153 | --- a/include/linux/jbd2.h 154 | +++ b/include/linux/jbd2.h 155 | @@ -1157,12 +1157,41 @@ extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long * 156 | extern int jbd2_journal_force_commit(journal_t *); 157 | extern int jbd2_journal_force_commit_nested(journal_t *); 158 | extern int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *inode); 159 | -extern int jbd2_journal_begin_ordered_truncate(journal_t *journal, 160 | - struct jbd2_inode *inode, loff_t new_size); 161 | +extern int jbd2_journal_begin_ordered_punch_hole(journal_t *, 162 | + struct jbd2_inode *, 163 | + loff_t, loff_t); 164 | extern void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode); 165 | extern void jbd2_journal_release_jbd_inode(journal_t *journal, struct jbd2_inode *jinode); 166 | 167 | /* 168 | + * File truncate and transaction commit interact with each other in a 169 | + * non-trivial way. If a transaction writing data block A is 170 | + * committing, we cannot discard the data by truncate until we have 171 | + * written them. 
Otherwise if we crashed after the transaction with 172 | + * write has committed but before the transaction with truncate has 173 | + * committed, we could see stale data in block A. This function is a 174 | + * helper to solve this problem. It starts writeout of the truncated 175 | + * part in case it is in the committing transaction. 176 | + * 177 | + * Filesystem code must call this function when inode is journaled in 178 | + * ordered mode before truncation happens and after the inode has been 179 | + * placed on orphan list with the new inode size. The second condition 180 | + * avoids the race that someone writes new data and we start 181 | + * committing the transaction after this function has been called but 182 | + * before a transaction for truncate is started (and furthermore it 183 | + * allows us to optimize the case where the addition to orphan list 184 | + * happens in the same transaction as write --- we don't have to write 185 | + * any data in such case). 186 | + */ 187 | +static inline int jbd2_journal_begin_ordered_truncate(journal_t *journal, 188 | + struct jbd2_inode *jinode, 189 | + loff_t new_size) 190 | +{ 191 | + return jbd2_journal_begin_ordered_punch_hole(journal, jinode, 192 | + new_size, LLONG_MAX); 193 | +} 194 | + 195 | +/* 196 | * journal_head management 197 | */ 198 | struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh); 199 | -------------------------------------------------------------------------------- /archive/bio-debug: -------------------------------------------------------------------------------- 1 | ext4: debugging patches for the bio code 2 | 3 | --- 4 | block/blk-core.c | 17 ++++++++++ 5 | fs/buffer.c | 36 ++++++++++++++++++++++ 6 | fs/ext4/inode.c | 18 +++++++++++ 7 | fs/ext4/mballoc.c | 8 ++++- 8 | fs/ext4/page-io.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++- 9 | fs/jbd2/commit.c | 18 +++++++++++ 10 | fs/jbd2/transaction.c | 5 +++ 11 | mm/filemap.c | 7 ++++ 12 | 8 files changed, 186 
insertions(+), 3 deletions(-) 13 | 14 | diff --git a/block/blk-core.c b/block/blk-core.c 15 | index 32a1c12..115574b 100644 16 | --- a/block/blk-core.c 17 | +++ b/block/blk-core.c 18 | @@ -248,6 +248,9 @@ int blk_remove_plug(struct request_queue *q) 19 | if (!queue_flag_test_and_clear(QUEUE_FLAG_PLUGGED, q)) 20 | return 0; 21 | 22 | +#if 1 /* PDEBUG */ 23 | + trace_printk("del timer %s\n", q->backing_dev_info.name); 24 | +#endif 25 | del_timer(&q->unplug_timer); 26 | return 1; 27 | } 28 | @@ -379,6 +382,9 @@ EXPORT_SYMBOL(blk_stop_queue); 29 | */ 30 | void blk_sync_queue(struct request_queue *q) 31 | { 32 | +#if 1 /* PDEBUG */ 33 | + trace_printk("del timer %s\n", q->backing_dev_info.name); 34 | +#endif 35 | del_timer_sync(&q->unplug_timer); 36 | del_timer_sync(&q->timeout); 37 | cancel_work_sync(&q->unplug_work); 38 | @@ -1525,6 +1531,17 @@ static inline void __generic_make_request(struct bio *bio) 39 | trace_block_bio_queue(q, bio); 40 | 41 | ret = q->make_request_fn(q, bio); 42 | + 43 | +#if 1 /* PDEBUG */ 44 | + { 45 | + char str[KSYM_SYMBOL_LEN]; 46 | + 47 | + kallsyms_lookup((unsigned long) q->make_request_fn, 48 | + NULL, NULL, NULL, str); 49 | + trace_printk("returned from %s (pid %d)\n", 50 | + str, task_pid_nr(current)); 51 | + } 52 | +#endif 53 | } while (ret); 54 | 55 | return; 56 | diff --git a/fs/buffer.c b/fs/buffer.c 57 | index 3e7dca2..ed188f5 100644 58 | --- a/fs/buffer.c 59 | +++ b/fs/buffer.c 60 | @@ -70,6 +70,19 @@ static int sync_buffer(void *word) 61 | 62 | void __lock_buffer(struct buffer_head *bh) 63 | { 64 | +#if 1 /* PDEBUG */ 65 | + void *ip = __builtin_return_address(0); 66 | + char str[KSYM_SYMBOL_LEN]; 67 | + char b[BDEVNAME_SIZE]; 68 | + 69 | + if (buffer_locked(bh)) { 70 | + kallsyms_lookup((unsigned long) ip, NULL, NULL, NULL, str); 71 | + trace_printk("lock bh %s blk %lu, ret_pc %p (%s) pid %d\n", 72 | + bdevname(bh->b_bdev, b), 73 | + (unsigned long) bh->b_blocknr, ip, str, 74 | + task_pid_nr(current)); 75 | + } 76 | +#endif 77 | 
wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer, 78 | TASK_UNINTERRUPTIBLE); 79 | } 80 | @@ -90,6 +103,17 @@ EXPORT_SYMBOL(unlock_buffer); 81 | */ 82 | void __wait_on_buffer(struct buffer_head * bh) 83 | { 84 | +#if 1 /* PDEBUG */ 85 | + void *ip = __builtin_return_address(0); 86 | + char str[KSYM_SYMBOL_LEN]; 87 | + char b[BDEVNAME_SIZE]; 88 | + 89 | + kallsyms_lookup((unsigned long) ip, NULL, NULL, NULL, str); 90 | + trace_printk("dev %s blk %lu, ret_pc %p (%s) pid %d\n", 91 | + bdevname(bh->b_bdev, b), 92 | + (unsigned long) bh->b_blocknr, ip, str, 93 | + task_pid_nr(current)); 94 | +#endif 95 | wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE); 96 | } 97 | EXPORT_SYMBOL(__wait_on_buffer); 98 | @@ -2906,6 +2930,18 @@ int submit_bh(int rw, struct buffer_head * bh) 99 | struct bio *bio; 100 | int ret = 0; 101 | 102 | +#if 1 /* PDEBUG */ 103 | + void *ip = __builtin_return_address(0); 104 | + char str[KSYM_SYMBOL_LEN]; 105 | + char b[BDEVNAME_SIZE]; 106 | + 107 | + kallsyms_lookup((unsigned long) ip, NULL, NULL, NULL, str); 108 | + trace_printk("dev %s blk %lu, ret_pc %p (%s) pid %d\n", 109 | + bdevname(bh->b_bdev, b), 110 | + (unsigned long) bh->b_blocknr, ip, str, 111 | + task_pid_nr(current)); 112 | +#endif 113 | + 114 | BUG_ON(!buffer_locked(bh)); 115 | BUG_ON(!buffer_mapped(bh)); 116 | BUG_ON(!bh->b_end_io); 117 | diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c 118 | index 79fdace..82eafb9 100644 119 | --- a/fs/ext4/inode.c 120 | +++ b/fs/ext4/inode.c 121 | @@ -2029,6 +2029,11 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, 122 | index = mpd->first_page; 123 | end = mpd->next_page - 1; 124 | 125 | +#if 1 /* PDEBUG */ 126 | + trace_printk("%s: ino %lu index %lu end %lu size %lu\n", 127 | + inode->i_sb->s_id, inode->i_ino, 128 | + index, end, (unsigned long) size); 129 | +#endif 130 | pagevec_init(&pvec, 0); 131 | while (index <= end) { 132 | nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); 133 | @@ -3654,6 
+3659,9 @@ int flush_completed_IO(struct inode *inode) 134 | if (list_empty(&ei->i_completed_io_list)) 135 | return ret; 136 | 137 | +#if 1 /* PDEBUG */ 138 | + trace_printk("%s: ino %lu\n", inode->i_sb->s_id, inode->i_ino); 139 | +#endif 140 | dump_completed_IO(inode); 141 | spin_lock_irqsave(&ei->i_completed_io_lock, flags); 142 | while (!list_empty(&ei->i_completed_io_list)){ 143 | @@ -3694,6 +3702,16 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, 144 | unsigned long flags; 145 | struct ext4_inode_info *ei; 146 | 147 | +#if 1 /* PDEBUG */ 148 | + if (io_end) 149 | + trace_printk("%s: ino %lu io_end %p size %lu\n", 150 | + io_end->inode->i_sb->s_id, 151 | + io_end->inode->i_ino, io_end, 152 | + (unsigned long) size); 153 | + else 154 | + trace_printk("null io_end\n"); 155 | +#endif 156 | + 157 | /* if not async direct IO or dio with 0 bytes write, just return */ 158 | if (!io_end || !size) 159 | goto out; 160 | diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c 161 | index d732ef5..d1b68b7 100644 162 | --- a/fs/ext4/mballoc.c 163 | +++ b/fs/ext4/mballoc.c 164 | @@ -861,8 +861,14 @@ static int ext4_mb_init_cache(struct page *page, char *incore) 165 | } 166 | 167 | /* wait for I/O completion */ 168 | - for (i = 0; i < groups_per_page && bh[i]; i++) 169 | + for (i = 0; i < groups_per_page && bh[i]; i++) { 170 | +#if 1 /* PDEBUG */ 171 | + if (buffer_locked(bh[i])) 172 | + trace_printk("%s: wait on %lu\n", sb->s_id, 173 | + (unsigned long) bh[i]->b_blocknr); 174 | +#endif 175 | wait_on_buffer(bh[i]); 176 | + } 177 | 178 | err = -EIO; 179 | for (i = 0; i < groups_per_page && bh[i]; i++) 180 | diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c 181 | index ccce2c6..c2f42e0 100644 182 | --- a/fs/ext4/page-io.c 183 | +++ b/fs/ext4/page-io.c 184 | @@ -30,6 +30,8 @@ 185 | #include "acl.h" 186 | #include "ext4_extents.h" 187 | 188 | +#define PDEBUG 189 | + 190 | static struct kmem_cache *io_page_cachep, *io_end_cachep; 191 | 192 | int __init 
init_ext4_pageio(void) 193 | @@ -56,6 +58,9 @@ void ext4_free_io_end(ext4_io_end_t *io) 194 | { 195 | int i; 196 | 197 | +#ifdef PDEBUG 198 | + trace_printk("%p\n", io); 199 | +#endif 200 | BUG_ON(!io); 201 | if (io->page) 202 | put_page(io->page); 203 | @@ -63,6 +68,11 @@ void ext4_free_io_end(ext4_io_end_t *io) 204 | if (--io->pages[i]->p_count == 0) { 205 | struct page *page = io->pages[i]->p_page; 206 | 207 | +#ifdef PDEBUG 208 | + trace_printk("%s: end_page_writeback for %lu:%lu\n", 209 | + io->inode->i_sb->s_id, io->inode->i_ino, 210 | + (unsigned long) page->index); 211 | +#endif 212 | end_page_writeback(page); 213 | put_page(page); 214 | kmem_cache_free(io_page_cachep, io->pages[i]); 215 | @@ -121,6 +131,9 @@ static void ext4_end_io_work(struct work_struct *work) 216 | int ret; 217 | 218 | mutex_lock(&inode->i_mutex); 219 | +#ifdef PDEBUG 220 | + trace_printk("%p\n", io); 221 | +#endif 222 | ret = ext4_end_io_nolock(io); 223 | if (ret < 0) { 224 | mutex_unlock(&inode->i_mutex); 225 | @@ -147,6 +160,9 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) 226 | INIT_WORK(&io->work, ext4_end_io_work); 227 | INIT_LIST_HEAD(&io->list); 228 | } 229 | +#ifdef PDEBUG 230 | + trace_printk("%p\n", io); 231 | +#endif 232 | return io; 233 | } 234 | 235 | @@ -175,6 +191,12 @@ static void ext4_end_bio(struct bio *bio, int error) 236 | 237 | BUG_ON(!io_end); 238 | inode = io_end->inode; 239 | +#ifdef PDEBUG 240 | + trace_printk("%s: enter: ino %lu offset %lu size %ld io_end=%p\n", 241 | + inode->i_sb->s_id, inode->i_ino, 242 | + (unsigned long) io_end->offset, 243 | + (long) io_end->size, io_end); 244 | +#endif 245 | bio->bi_private = NULL; 246 | bio->bi_end_io = NULL; 247 | if (test_bit(BIO_UPTODATE, &bio->bi_flags)) 248 | @@ -235,6 +257,12 @@ static void ext4_end_bio(struct bio *bio, int error) 249 | if (--io_end->pages[i]->p_count == 0) { 250 | struct page *page = io_end->pages[i]->p_page; 251 | 252 | +#ifdef PDEBUG 253 | + trace_printk("%s: 
end_page_writeback for %lu:%lu\n", 254 | + io_end->inode->i_sb->s_id, 255 | + io_end->inode->i_ino, 256 | + (unsigned long) page->index); 257 | +#endif 258 | end_page_writeback(page); 259 | put_page(page); 260 | kmem_cache_free(io_page_cachep, io_end->pages[i]); 261 | @@ -261,12 +289,24 @@ static void ext4_end_bio(struct bio *bio, int error) 262 | wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq; 263 | /* queue the work to convert unwritten extents to written */ 264 | queue_work(wq, &io_end->work); 265 | +#ifdef PDEBUG 266 | + trace_printk("%s: exit: ino %lu\n", inode->i_sb->s_id, 267 | + io_end->inode->i_ino); 268 | +#endif 269 | } 270 | 271 | void ext4_io_submit(struct ext4_io_submit *io) 272 | { 273 | struct bio *bio = io->io_bio; 274 | 275 | +#ifdef PDEBUG 276 | + if (io->io_end) 277 | + trace_printk("%s: io submitted io_end %p\n", 278 | + io->io_end->inode->i_sb->s_id, io->io_end); 279 | + else 280 | + trace_printk("io submitted io_end %p\n", 281 | + io->io_end); 282 | +#endif 283 | if (bio) { 284 | bio_get(io->io_bio); 285 | submit_bio(io->io_op, io->io_bio); 286 | @@ -308,10 +348,14 @@ static int io_submit_init(struct ext4_io_submit *io, 287 | io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? 
288 | WRITE_SYNC_PLUG : WRITE); 289 | io->io_next_block = bh->b_blocknr; 290 | +#ifdef PDEBUG 291 | + trace_printk("%s: io_submit_init for ino %lu, nvecs = %d\n", 292 | + inode->i_sb->s_id, inode->i_ino, nvecs); 293 | +#endif 294 | return 0; 295 | } 296 | 297 | -static int io_submit_add_bh(struct ext4_io_submit *io, 298 | +static noinline int io_submit_add_bh(struct ext4_io_submit *io, 299 | struct ext4_io_page *io_page, 300 | struct inode *inode, 301 | struct writeback_control *wbc, 302 | @@ -320,6 +364,14 @@ static int io_submit_add_bh(struct ext4_io_submit *io, 303 | ext4_io_end_t *io_end; 304 | int ret; 305 | 306 | +#ifdef PDEBUG 307 | + trace_printk("%s enter: ino %lu blk %lu %s%s%s%s\n", inode->i_sb->s_id, 308 | + inode->i_ino, (unsigned long) bh->b_blocknr, 309 | + buffer_new(bh) ? "N" : "", 310 | + buffer_mapped(bh) ? "M" : "", 311 | + buffer_delay(bh) ? "D" : "", 312 | + buffer_dirty(bh) ? "d" : ""); 313 | +#endif 314 | if (buffer_new(bh)) { 315 | clear_buffer_new(bh); 316 | unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); 317 | @@ -351,13 +403,29 @@ submit_and_retry: 318 | io->io_end->size += bh->b_size; 319 | io->io_next_block++; 320 | ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); 321 | - if (ret != bh->b_size) 322 | + if (ret != bh->b_size) { 323 | +#ifdef PDEBUG 324 | + trace_printk("%s: submit and retry (ret = %d, size=%d, " 325 | + "offset=%lu)\n", inode->i_sb->s_id, ret, 326 | + bh->b_size, bh_offset(bh)); 327 | +#endif 328 | goto submit_and_retry; 329 | + } 330 | if ((io_end->num_io_pages == 0) || 331 | (io_end->pages[io_end->num_io_pages-1] != io_page)) { 332 | io_end->pages[io_end->num_io_pages++] = io_page; 333 | io_page->p_count++; 334 | } 335 | +#ifdef PDEBUG 336 | + if (io->io_end) 337 | + trace_printk("%s: exit: ino %lu offset %lu size %ld\n", 338 | + inode->i_sb->s_id, inode->i_ino, 339 | + (unsigned long) io->io_end->offset, 340 | + (unsigned long) io->io_end->size); 341 | + else 342 | + trace_printk("%s: 
exit: ino %lu no_io_end\n", 343 | + inode->i_sb->s_id, inode->i_ino); 344 | +#endif 345 | return 0; 346 | } 347 | 348 | @@ -372,6 +440,10 @@ int ext4_bio_write_page(struct ext4_io_submit *io, 349 | struct buffer_head *bh, *head; 350 | int ret = 0; 351 | 352 | +#ifdef PDEBUG 353 | + trace_printk("%s: enter: ino %lu page %lu len %d\n", inode->i_sb->s_id, 354 | + inode->i_ino, page->index, len); 355 | +#endif 356 | blocksize = 1 << inode->i_blkbits; 357 | 358 | BUG_ON(PageWriteback(page)); 359 | @@ -422,5 +494,9 @@ int ext4_bio_write_page(struct ext4_io_submit *io, 360 | end_page_writeback(page); 361 | kmem_cache_free(io_page_cachep, io_page); 362 | } 363 | +#ifdef PDEBUG 364 | + trace_printk("%s: exit: for ino %lu\n", inode->i_sb->s_id, 365 | + inode->i_ino); 366 | +#endif 367 | return ret; 368 | } 369 | diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c 370 | index 6494c81..d3f8634 100644 371 | --- a/fs/jbd2/commit.c 372 | +++ b/fs/jbd2/commit.c 373 | @@ -631,6 +631,10 @@ void jbd2_journal_commit_transaction(journal_t *journal) 374 | * (which is of type BJ_IO) 375 | */ 376 | JBUFFER_TRACE(jh, "ph3: write metadata"); 377 | +#if 1 /* PDEBUG */ 378 | + trace_printk("@635 %s block %llu\n", journal->j_devname, 379 | + blocknr); 380 | +#endif 381 | flags = jbd2_journal_write_metadata_buffer(commit_transaction, 382 | jh, &new_jh, blocknr); 383 | if (flags < 0) { 384 | @@ -693,6 +697,11 @@ start_journal_io: 385 | clear_buffer_dirty(bh); 386 | set_buffer_uptodate(bh); 387 | bh->b_end_io = journal_end_buffer_io_sync; 388 | +#if 1 /* PDEBUG */ 389 | + trace_printk("@700 %s block %llu\n", 390 | + journal->j_devname, 391 | + bh->b_blocknr); 392 | +#endif 393 | submit_bh(write_op, bh); 394 | } 395 | cond_resched(); 396 | @@ -762,6 +771,10 @@ wait_for_iobuf: 397 | jh = commit_transaction->t_iobuf_list->b_tprev; 398 | bh = jh2bh(jh); 399 | if (buffer_locked(bh)) { 400 | +#if 1 /* PDEBUG */ 401 | + trace_printk("jbd wait_on_buffer@765: %lu\n", 402 | + (unsigned long) bh->b_blocknr); 
403 | +#endif 404 | wait_on_buffer(bh); 405 | goto wait_for_iobuf; 406 | } 407 | @@ -818,6 +831,11 @@ wait_for_iobuf: 408 | jh = commit_transaction->t_log_list->b_tprev; 409 | bh = jh2bh(jh); 410 | if (buffer_locked(bh)) { 411 | +#if 1 /* PDEBUG */ 412 | + trace_printk("%s: jbd wait_on_buffer@823: %lu\n", 413 | + journal->j_devname, 414 | + (unsigned long) bh->b_blocknr); 415 | +#endif 416 | wait_on_buffer(bh); 417 | goto wait_for_ctlbuf; 418 | } 419 | diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c 420 | index 6bf0a24..8873caa 100644 421 | --- a/fs/jbd2/transaction.c 422 | +++ b/fs/jbd2/transaction.c 423 | @@ -701,6 +701,11 @@ repeat: 424 | for ( ; ; ) { 425 | prepare_to_wait(wqh, &wait.wait, 426 | TASK_UNINTERRUPTIBLE); 427 | +#if 1 /* PDEBUG */ 428 | + trace_printk("%s: BJ shadow waiting on %lu\n", 429 | + journal->j_devname, 430 | + (unsigned long) bh->b_blocknr); 431 | +#endif 432 | if (jh->b_jlist != BJ_Shadow) 433 | break; 434 | schedule(); 435 | diff --git a/mm/filemap.c b/mm/filemap.c 436 | index 3d4df44..e0c7061 100644 437 | --- a/mm/filemap.c 438 | +++ b/mm/filemap.c 439 | @@ -295,6 +295,13 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, 440 | if (page->index > end) 441 | continue; 442 | 443 | +#if 1 /* PDEBUG */ 444 | + if (PageWriteback(page)) 445 | + trace_printk("pid %d waiting on %lu:%lu\n", 446 | + task_pid_nr(current), 447 | + mapping->host->i_ino, 448 | + (unsigned long) page->index); 449 | +#endif 450 | wait_on_page_writeback(page); 451 | if (PageError(page)) 452 | ret = -EIO; 453 | -------------------------------------------------------------------------------- /archive/introduce-new-i_write_mutex: -------------------------------------------------------------------------------- 1 | ext4: introduce new i_write_mutex to protect fallocate 2 | 3 | From: Namjae Jeon 4 | 5 | Introduce new i_write_mutex to protect new writes from coming while doing 6 | fallocate operations. 
Also, get rid of aio_mutex as it is covered by 7 | i_write_mutex. 8 | 9 | Signed-off-by: Namjae Jeon 10 | Signed-off-by: Ashish Sangwan 11 | Signed-off-by: Theodore Ts'o 12 | --- 13 | fs/ext4/ext4.h | 6 +++--- 14 | fs/ext4/extents.c | 19 +++++++++++++++---- 15 | fs/ext4/file.c | 23 +++++++++++++---------- 16 | fs/ext4/inode.c | 7 ++++++- 17 | fs/ext4/super.c | 3 +-- 18 | 5 files changed, 38 insertions(+), 20 deletions(-) 19 | 20 | diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h 21 | index 1479e2a..0519715 100644 22 | --- a/fs/ext4/ext4.h 23 | +++ b/fs/ext4/ext4.h 24 | @@ -943,6 +943,9 @@ struct ext4_inode_info { 25 | 26 | /* Precomputed uuid+inum+igen checksum for seeding inode checksums */ 27 | __u32 i_csum_seed; 28 | + 29 | + /* protects fallocate operations racing with new writes */ 30 | + struct mutex i_write_mutex; 31 | }; 32 | 33 | /* 34 | @@ -2805,10 +2808,7 @@ static inline void ext4_inode_resume_unlocked_dio(struct inode *inode) 35 | #define EXT4_WQ_HASH_SZ 37 36 | #define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\ 37 | EXT4_WQ_HASH_SZ]) 38 | -#define ext4_aio_mutex(v) (&ext4__aio_mutex[((unsigned long)(v)) %\ 39 | - EXT4_WQ_HASH_SZ]) 40 | extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; 41 | -extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ]; 42 | 43 | #define EXT4_RESIZING 0 44 | extern int ext4_resize_begin(struct super_block *sb); 45 | diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c 46 | index 5bbe4256..cb23a34 100644 47 | --- a/fs/ext4/extents.c 48 | +++ b/fs/ext4/extents.c 49 | @@ -4741,6 +4741,8 @@ static long ext4_zero_range(struct file *file, loff_t offset, 50 | if (!S_ISREG(inode->i_mode)) 51 | return -EINVAL; 52 | 53 | + mutex_lock(&EXT4_I(inode)->i_write_mutex); 54 | + 55 | /* 56 | * Write out all dirty pages to avoid race conditions 57 | * Then release them. 
58 | @@ -4748,8 +4750,10 @@ static long ext4_zero_range(struct file *file, loff_t offset, 59 | if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { 60 | ret = filemap_write_and_wait_range(mapping, offset, 61 | offset + len - 1); 62 | - if (ret) 63 | + if (ret) { 64 | + mutex_unlock(&EXT4_I(inode)->i_write_mutex); 65 | return ret; 66 | + } 67 | } 68 | 69 | /* 70 | @@ -4761,8 +4765,10 @@ static long ext4_zero_range(struct file *file, loff_t offset, 71 | start = round_up(offset, 1 << blkbits); 72 | end = round_down((offset + len), 1 << blkbits); 73 | 74 | - if (start < offset || end > offset + len) 75 | + if (start < offset || end > offset + len) { 76 | + mutex_unlock(&EXT4_I(inode)->i_write_mutex); 77 | return -EINVAL; 78 | + } 79 | partial = (offset + len) & ((1 << blkbits) - 1); 80 | 81 | lblk = start >> blkbits; 82 | @@ -4859,6 +4865,7 @@ out_dio: 83 | ext4_inode_resume_unlocked_dio(inode); 84 | out_mutex: 85 | mutex_unlock(&inode->i_mutex); 86 | + mutex_unlock(&EXT4_I(inode)->i_write_mutex); 87 | return ret; 88 | } 89 | 90 | @@ -5411,11 +5418,13 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) 91 | punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb); 92 | punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb); 93 | 94 | + mutex_lock(&EXT4_I(inode)->i_write_mutex); 95 | + 96 | /* Call ext4_force_commit to flush all data in case of data=journal. 
*/ 97 | if (ext4_should_journal_data(inode)) { 98 | ret = ext4_force_commit(inode->i_sb); 99 | if (ret) 100 | - return ret; 101 | + goto out_i_write_mutex; 102 | } 103 | 104 | /* 105 | @@ -5428,7 +5437,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) 106 | ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, 107 | LLONG_MAX); 108 | if (ret) 109 | - return ret; 110 | + goto out_i_write_mutex; 111 | 112 | /* Take mutex lock */ 113 | mutex_lock(&inode->i_mutex); 114 | @@ -5501,5 +5510,7 @@ out_dio: 115 | ext4_inode_resume_unlocked_dio(inode); 116 | out_mutex: 117 | mutex_unlock(&inode->i_mutex); 118 | +out_i_write_mutex: 119 | + mutex_unlock(&EXT4_I(inode)->i_write_mutex); 120 | return ret; 121 | } 122 | diff --git a/fs/ext4/file.c b/fs/ext4/file.c 123 | index 4e8bc284..e5cd87f 100644 124 | --- a/fs/ext4/file.c 125 | +++ b/fs/ext4/file.c 126 | @@ -97,7 +97,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov, 127 | { 128 | struct file *file = iocb->ki_filp; 129 | struct inode *inode = file_inode(iocb->ki_filp); 130 | - struct mutex *aio_mutex = NULL; 131 | + bool unaligned_direct_aio = false; 132 | struct blk_plug plug; 133 | int o_direct = file->f_flags & O_DIRECT; 134 | int overwrite = 0; 135 | @@ -106,6 +106,8 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov, 136 | 137 | BUG_ON(iocb->ki_pos != pos); 138 | 139 | + mutex_lock(&EXT4_I(inode)->i_write_mutex); 140 | + 141 | /* 142 | * Unaligned direct AIO must be serialized; see comment above 143 | * In the case of O_APPEND, assume that we must always serialize 144 | @@ -115,8 +117,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov, 145 | !is_sync_kiocb(iocb) && 146 | (file->f_flags & O_APPEND || 147 | ext4_unaligned_aio(inode, iov, nr_segs, pos))) { 148 | - aio_mutex = ext4_aio_mutex(inode); 149 | - mutex_lock(aio_mutex); 150 | + unaligned_direct_aio = true; 151 | ext4_unwritten_wait(inode); 152 | } 153 | 154 | @@ -134,8 +135,8 @@ 
ext4_file_write(struct kiocb *iocb, const struct iovec *iov, 155 | if ((pos > sbi->s_bitmap_maxbytes) || 156 | (pos == sbi->s_bitmap_maxbytes && length > 0)) { 157 | mutex_unlock(&inode->i_mutex); 158 | - ret = -EFBIG; 159 | - goto errout; 160 | + mutex_unlock(&EXT4_I(inode)->i_write_mutex); 161 | + return -EFBIG; 162 | } 163 | 164 | if (pos + length > sbi->s_bitmap_maxbytes) { 165 | @@ -150,8 +151,9 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov, 166 | iocb->private = &overwrite; 167 | 168 | /* check whether we do a DIO overwrite or not */ 169 | - if (ext4_should_dioread_nolock(inode) && !aio_mutex && 170 | - !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) { 171 | + if (ext4_should_dioread_nolock(inode) && 172 | + !unaligned_direct_aio && !file->f_mapping->nrpages && 173 | + pos + length <= i_size_read(inode)) { 174 | struct ext4_map_blocks map; 175 | unsigned int blkbits = inode->i_blkbits; 176 | int err, len; 177 | @@ -181,6 +183,8 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov, 178 | 179 | ret = __generic_file_aio_write(iocb, iov, nr_segs); 180 | mutex_unlock(&inode->i_mutex); 181 | + if (!unaligned_direct_aio) 182 | + mutex_unlock(&EXT4_I(inode)->i_write_mutex); 183 | 184 | if (ret > 0) { 185 | ssize_t err; 186 | @@ -192,9 +196,8 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov, 187 | if (o_direct) 188 | blk_finish_plug(&plug); 189 | 190 | -errout: 191 | - if (aio_mutex) 192 | - mutex_unlock(aio_mutex); 193 | + if (unaligned_direct_aio) 194 | + mutex_unlock(&EXT4_I(inode)->i_write_mutex); 195 | return ret; 196 | } 197 | 198 | diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c 199 | index 645de3e..55f999a 100644 200 | --- a/fs/ext4/inode.c 201 | +++ b/fs/ext4/inode.c 202 | @@ -3534,6 +3534,8 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) 203 | 204 | trace_ext4_punch_hole(inode, offset, length, 0); 205 | 206 | + mutex_lock(&EXT4_I(inode)->i_write_mutex); 207 | + 208 | /* 209 | 
* Write out all dirty pages to avoid race conditions 210 | * Then release them. 211 | @@ -3541,8 +3543,10 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) 212 | if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { 213 | ret = filemap_write_and_wait_range(mapping, offset, 214 | offset + length - 1); 215 | - if (ret) 216 | + if (ret) { 217 | + mutex_unlock(&EXT4_I(inode)->i_write_mutex); 218 | return ret; 219 | + } 220 | } 221 | 222 | mutex_lock(&inode->i_mutex); 223 | @@ -3643,6 +3647,7 @@ out_dio: 224 | ext4_inode_resume_unlocked_dio(inode); 225 | out_mutex: 226 | mutex_unlock(&inode->i_mutex); 227 | + mutex_unlock(&EXT4_I(inode)->i_write_mutex); 228 | return ret; 229 | } 230 | 231 | diff --git a/fs/ext4/super.c b/fs/ext4/super.c 232 | index b9b9aab..7667a5b 100644 233 | --- a/fs/ext4/super.c 234 | +++ b/fs/ext4/super.c 235 | @@ -904,6 +904,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) 236 | atomic_set(&ei->i_ioend_count, 0); 237 | atomic_set(&ei->i_unwritten, 0); 238 | INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); 239 | + mutex_init(&ei->i_write_mutex); 240 | 241 | return &ei->vfs_inode; 242 | } 243 | @@ -5516,7 +5517,6 @@ static void ext4_exit_feat_adverts(void) 244 | 245 | /* Shared across all ext4 file systems */ 246 | wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; 247 | -struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ]; 248 | 249 | static int __init ext4_init_fs(void) 250 | { 251 | @@ -5529,7 +5529,6 @@ static int __init ext4_init_fs(void) 252 | ext4_check_flag_values(); 253 | 254 | for (i = 0; i < EXT4_WQ_HASH_SZ; i++) { 255 | - mutex_init(&ext4__aio_mutex[i]); 256 | init_waitqueue_head(&ext4__ioend_wq[i]); 257 | } 258 | 259 | -------------------------------------------------------------------------------- /archive/jbd2-dont-write-non-commit-blocks-synchronously: -------------------------------------------------------------------------------- 1 | jbd2: don't write non-commit 
blocks synchronously 2 | 3 | We don't need to write the revoke blocks and descriptor blocks using 4 | WRITE_SYNC, since when we issue the commit block, those blocks will get 5 | pushed out via REQ_FLUSH. This will allow the journal blocks to be 6 | written in fewer i/o operations (otherwise we end up issuing a whole 7 | series of 4k writes unnecessarily). 8 | 9 | Signed-off-by: "Theodore Ts'o" 10 | --- 11 | fs/jbd2/commit.c | 4 ++-- 12 | 1 file changed, 2 insertions(+), 2 deletions(-) 13 | 14 | diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c 15 | index cf2fc05..fb64629 100644 16 | --- a/fs/jbd2/commit.c 17 | +++ b/fs/jbd2/commit.c 18 | @@ -554,7 +554,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) 19 | 20 | blk_start_plug(&plug); 21 | jbd2_journal_write_revoke_records(journal, commit_transaction, 22 | - &log_bufs, WRITE_SYNC); 23 | + &log_bufs, WRITE); 24 | blk_finish_plug(&plug); 25 | 26 | jbd_debug(3, "JBD2: commit phase 2b\n"); 27 | @@ -739,7 +739,7 @@ start_journal_io: 28 | clear_buffer_dirty(bh); 29 | set_buffer_uptodate(bh); 30 | bh->b_end_io = journal_end_buffer_io_sync; 31 | - submit_bh(WRITE_SYNC, bh); 32 | + submit_bh(WRITE, bh); 33 | } 34 | cond_resched(); 35 | stats.run.rs_blocks_logged += bufs; 36 | -------------------------------------------------------------------------------- /cleaner: -------------------------------------------------------------------------------- 1 | Introduce cleaner 2 | 3 | From: Abutalib Aghayev 4 | 5 | An experimental cleaner. Copy the live blocks from the transaction at the 6 | tail in batches to the transaction at the head. After a commit ends, check 7 | if free space is below watermark and start cleaning until free space is 8 | above high watermark. 
9 | 10 | Signed-off-by: Abutalib Aghayev 11 | Signed-off-by: Theodore Ts'o 12 | --- 13 | fs/jbd2/Makefile | 2 +- 14 | fs/jbd2/checkpoint.c | 3 + 15 | fs/jbd2/cleaner.c | 368 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 16 | fs/jbd2/jmap.c | 34 ++++++++ 17 | fs/jbd2/jmap.h | 77 +++++++++++++++++ 18 | fs/jbd2/journal.c | 23 +++++- 19 | include/linux/jbd2.h | 8 ++ 20 | 7 files changed, 512 insertions(+), 3 deletions(-) 21 | 22 | diff --git a/fs/jbd2/Makefile b/fs/jbd2/Makefile 23 | index a54f50b3a06e..b6a2dddcc0a7 100644 24 | --- a/fs/jbd2/Makefile 25 | +++ b/fs/jbd2/Makefile 26 | @@ -5,4 +5,4 @@ 27 | obj-$(CONFIG_JBD2) += jbd2.o 28 | 29 | jbd2-objs := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o \ 30 | - jmap.o 31 | + jmap.o cleaner.o 32 | diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c 33 | index c125d662777c..b2468698f566 100644 34 | --- a/fs/jbd2/checkpoint.c 35 | +++ b/fs/jbd2/checkpoint.c 36 | @@ -386,6 +386,9 @@ int jbd2_cleanup_journal_tail(journal_t *journal) 37 | tid_t first_tid; 38 | unsigned long blocknr; 39 | 40 | + if (journal->j_flags & JBD2_LAZY) 41 | + return 0; 42 | + 43 | if (is_journal_aborted(journal)) 44 | return -EIO; 45 | 46 | diff --git a/fs/jbd2/cleaner.c b/fs/jbd2/cleaner.c 47 | new file mode 100644 48 | index 000000000000..06ec11e1d2dd 49 | --- /dev/null 50 | +++ b/fs/jbd2/cleaner.c 51 | @@ -0,0 +1,368 @@ 52 | +#include 53 | +#include 54 | +#include "jmap.h" 55 | +#include 56 | +#include 57 | +#include 58 | +#include 59 | +#include 60 | + 61 | +static inline int jbd2_low_on_space(journal_t *journal) 62 | +{ 63 | + int x = atomic_read(&journal->j_cleaner_ctx->nr_txns_committed); 64 | + if (x > 10) { 65 | + trace_jbd2_jmap_printf1("low on space", x); 66 | + return true; 67 | + } 68 | + trace_jbd2_jmap_printf1("not low on space", x); 69 | + return false; 70 | +} 71 | + 72 | +static inline int jbd2_high_on_space(journal_t *journal) 73 | +{ 74 | + if 
(atomic_read(&journal->j_cleaner_ctx->nr_txns_cleaned) < 2) { 75 | + trace_jbd2_jmap_printf("not enough cleaned"); 76 | + return false; 77 | + } 78 | + trace_jbd2_jmap_printf("enough cleaned"); 79 | + atomic_set(&journal->j_cleaner_ctx->nr_txns_cleaned, 0); 80 | + atomic_set(&journal->j_cleaner_ctx->nr_txns_committed, 0); 81 | + return true; 82 | +} 83 | + 84 | +/* 85 | + * Tries to move the tail forward (hence free space) as long as the transaction 86 | + * at the tail has only stale blocks. Returns true if manages to free a 87 | + * transaction, false otherwise. 88 | + */ 89 | +static bool jbd2_try_to_move_tail(journal_t *journal) 90 | +{ 91 | + struct transaction_infos *tis = journal->j_transaction_infos; 92 | + struct transaction_info *ti, *ti1; 93 | + 94 | + /* 95 | + * Advance the tail as far as possible by skipping over transactions 96 | + * with no live blocks. 97 | + */ 98 | + write_lock(&journal->j_jmap_lock); 99 | + ti = ti1 = &tis->buf[tis->tail]; 100 | + 101 | + for ( ; list_empty(&ti->live_blks); ti = &tis->buf[tis->tail]) { 102 | + trace_jbd2_jmap_printf2("cleaned a transaction", 103 | + tis->tail, ti->tid); 104 | + tis->tail = (tis->tail + 1) & (MAX_LIVE_TRANSACTIONS - 1); 105 | + atomic_inc(&journal->j_cleaner_ctx->nr_txns_cleaned); 106 | + } 107 | + write_unlock(&journal->j_jmap_lock); 108 | + 109 | + if (ti == ti1) 110 | + return false; 111 | + /* 112 | + * In the worst case, this will end up updating the journal superblock 113 | + * after cleaning up every transaction. Should we avoid it? 114 | + */ 115 | + write_unlock(&journal->j_state_lock); 116 | + jbd2_update_log_tail(journal, ti->tid, ti->offset); 117 | + write_lock(&journal->j_state_lock); 118 | + 119 | + return true; 120 | +} 121 | + 122 | +/* 123 | + * Finds the live blocks at the tail transaction and copies the corresponding 124 | + * mappings to |ctx->mappings|. Returns the number of live block mappings 125 | + * copied. Should be called with a read lock on |j_jmap_lock|. 
126 | + */ 127 | +static int find_live_blocks(struct cleaner_ctx *ctx) 128 | +{ 129 | + journal_t *journal = ctx->journal; 130 | + struct transaction_infos *tis = journal->j_transaction_infos; 131 | + struct transaction_info *ti = &tis->buf[tis->tail]; 132 | + struct jmap_entry *je = NULL; 133 | + int i, nr_live = 0; 134 | + 135 | + if (unlikely(list_empty(&ti->live_blks))) 136 | + goto done; 137 | + 138 | + spin_lock(&ctx->pos_lock); 139 | + if (!ctx->pos) 140 | + ctx->pos = list_first_entry(&ti->live_blks, typeof(*je), list); 141 | + je = ctx->pos; 142 | + spin_unlock(&ctx->pos_lock); 143 | + 144 | + list_for_each_entry_from(je, &ti->live_blks, list) { 145 | + if (je->revoked) 146 | + continue; 147 | + ctx->mappings[nr_live++] = je->mapping; 148 | + if (nr_live == CLEANER_BATCH_SIZE) 149 | + break; 150 | + } 151 | + 152 | +done: 153 | + trace_jbd2_jmap_printf1("found live blocks", nr_live); 154 | + for (i = 0; i < nr_live; ++i) 155 | + trace_jbd2_jmap_printf2("m", 156 | + ctx->mappings[i].fsblk, 157 | + ctx->mappings[i].logblk); 158 | + return nr_live; 159 | +} 160 | + 161 | +static void live_block_read_end_io(struct buffer_head *bh, int uptodate) 162 | +{ 163 | + struct cleaner_ctx *ctx = bh->b_private; 164 | + 165 | + if (uptodate) { 166 | + set_buffer_uptodate(bh); 167 | + if (atomic_dec_and_test(&ctx->nr_pending_reads)) 168 | + wake_up(&ctx->live_block_reads); 169 | + } else { 170 | + WARN_ON(1); 171 | + clear_buffer_uptodate(bh); 172 | + } 173 | + 174 | + unlock_buffer(bh); 175 | + put_bh(bh); 176 | +} 177 | + 178 | +/* 179 | + * Reads live blocks in |ctx->mappings| populated by find_live_blocks into 180 | + * buffer heads in |ctx->bhs|. Returns true if at least one of the reads goes 181 | + * out to disk and false otherwise. If this function returns true then the 182 | + * client should sleep on the condition variable |ctx->live_block_reads|. 
The 183 | + * client will be woken up when all reads are complete, through the end_io 184 | + * handler attached to buffer heads read from disk. 185 | + */ 186 | +static bool read_live_blocks(struct cleaner_ctx *ctx, int nr_live) 187 | +{ 188 | + journal_t *journal = ctx->journal; 189 | + bool slow = false; 190 | + struct blk_plug plug; 191 | + bool plugged = false; 192 | + int i, rc; 193 | + 194 | + for (i = 0; i < nr_live; ++i) { 195 | + ctx->bhs[i] = __getblk(journal->j_dev, ctx->mappings[i].fsblk, 196 | + journal->j_blocksize); 197 | + if (unlikely(!ctx->bhs[i])) { 198 | + rc = -ENOMEM; 199 | + goto out_err; 200 | + } 201 | + if (buffer_uptodate(ctx->bhs[i])) 202 | + continue; 203 | + if (!plugged) { 204 | + plugged = true; 205 | + blk_start_plug(&plug); 206 | + } 207 | + lock_buffer(ctx->bhs[i]); 208 | + if (buffer_uptodate(ctx->bhs[i])) 209 | + continue; 210 | + ctx->bhs[i]->b_private = ctx; 211 | + ctx->bhs[i]->b_end_io = live_block_read_end_io; 212 | + get_bh(ctx->bhs[i]); 213 | + rc = read_block_from_log(ctx->journal, ctx->bhs[i], 214 | + REQ_RAHEAD, ctx->mappings[i].logblk); 215 | + if (unlikely(rc < 0)) 216 | + goto out_err; 217 | + atomic_inc(&ctx->nr_pending_reads); 218 | + if (rc) { 219 | + slow = true; 220 | + trace_jbd2_jmap_printf2("reading from disk", 221 | + ctx->mappings[i].fsblk, 222 | + ctx->mappings[i].logblk); 223 | + } else { 224 | + trace_jbd2_jmap_printf2("cached", 225 | + ctx->mappings[i].fsblk, 226 | + ctx->mappings[i].logblk); 227 | + } 228 | + } 229 | + if (plugged) 230 | + blk_finish_plug(&plug); 231 | + return slow; 232 | + 233 | +out_err: 234 | + if (plugged) 235 | + blk_finish_plug(&plug); 236 | + jbd2_journal_abort(ctx->journal, rc); 237 | + return false; 238 | +} 239 | + 240 | +/* 241 | + * This function finds the live blocks that became stale between the call to 242 | + * find_live_blocks and now, and discards them. It returns true if there are no 243 | + * more live blocks left at the tail transaction. 
244 | + */ 245 | +static bool discard_stale_blocks(struct cleaner_ctx *ctx, int nr_live) 246 | +{ 247 | + journal_t *journal = ctx->journal; 248 | + struct transaction_infos *tis = journal->j_transaction_infos; 249 | + struct transaction_info *ti = &tis->buf[tis->tail]; 250 | + struct jmap_entry *je = NULL; 251 | + int i = 0, j = 0, next = 0; 252 | + 253 | + trace_jbd2_jmap_printf(__func__); 254 | + spin_lock(&ctx->pos_lock); 255 | + BUG_ON(!ctx->pos); 256 | + je = ctx->pos; 257 | + list_for_each_entry_from(je, &ti->live_blks, list) { 258 | + for (j = next; j < nr_live; ++j) { 259 | + if (je->mapping.fsblk == ctx->mappings[j].fsblk) { 260 | + next = j+1; 261 | + ctx->pos = list_next_entry(je, list); 262 | + if (je->revoked) { 263 | + brelse(ctx->bhs[j]); 264 | + ctx->bhs[j] = NULL; 265 | + trace_jbd2_jmap_printf2( 266 | + "revoked", 267 | + ctx->mappings[i].fsblk, 268 | + ctx->mappings[i].logblk); 269 | + } 270 | + break; 271 | + } else { 272 | + trace_jbd2_jmap_printf2( 273 | + "moved to another list", 274 | + ctx->mappings[i].fsblk, 275 | + ctx->mappings[i].logblk); 276 | + brelse(ctx->bhs[j]); 277 | + ctx->bhs[j] = NULL; 278 | + } 279 | + } 280 | + if (++i == nr_live || j == nr_live) 281 | + break; 282 | + } 283 | + spin_unlock(&ctx->pos_lock); 284 | + 285 | + /* 286 | + * We have exited the loop. If we haven't processed all the entries in 287 | + * |ctx->mappings|, that is if (j < nr_live) at the exit, and we have 288 | + * not processed |nr_live| entries from the live blocks list at the 289 | + * tail, that is if (i < nr_live) at the exit, then the live blocks list 290 | + * has shrunk and the tail transaction has no live blocks left. 
291 | + */ 292 | + return j < nr_live && i < nr_live; 293 | +} 294 | + 295 | +static void attach_live_blocks(struct cleaner_ctx *ctx, handle_t *handle, 296 | + int nr_live) 297 | +{ 298 | + int err, i; 299 | + 300 | + trace_jbd2_jmap_printf(__func__); 301 | + for (i = 0; i < nr_live; ++i) { 302 | + if (!ctx->bhs[i]) 303 | + continue; 304 | + trace_jbd2_jmap_printf2("attaching", 305 | + ctx->mappings[i].fsblk, 306 | + ctx->mappings[i].logblk); 307 | + err = jbd2_journal_get_write_access(handle, ctx->bhs[i]); 308 | + if (!err) 309 | + err = jbd2_journal_dirty_metadata(handle, ctx->bhs[i]); 310 | + if (err) { 311 | + jbd2_journal_abort(ctx->journal, err); 312 | + return; 313 | + } 314 | + } 315 | +} 316 | + 317 | +/* 318 | + * Read the live blocks from the tail transaction and attach them to the current 319 | + * transaction. 320 | + */ 321 | +void jbd2_jmap_do_clean_batch(struct work_struct *work) 322 | +{ 323 | + struct cleaner_ctx *ctx = container_of(work, struct cleaner_ctx, work); 324 | + journal_t *journal = ctx->journal; 325 | + bool wake_up_commit_thread = true; 326 | + handle_t *handle = NULL; 327 | + int nr_live, err; 328 | + 329 | + read_lock(&journal->j_jmap_lock); 330 | + nr_live = find_live_blocks(ctx); 331 | + read_unlock(&journal->j_jmap_lock); 332 | + 333 | + if (nr_live < CLEANER_BATCH_SIZE) 334 | + wake_up_commit_thread = false; 335 | + if (nr_live == 0) 336 | + goto done; 337 | + 338 | + read_live_blocks(ctx, nr_live); 339 | + wait_event(ctx->live_block_reads, 340 | + atomic_read(&ctx->nr_pending_reads) <= 0); 341 | + 342 | + handle = jbd2_journal_start(journal, nr_live); 343 | + if (IS_ERR(handle)) { 344 | + jbd2_journal_abort(journal, PTR_ERR(handle)); 345 | + return; 346 | + } 347 | + 348 | + read_lock(&journal->j_jmap_lock); 349 | + if (discard_stale_blocks(ctx, nr_live)) 350 | + wake_up_commit_thread = false; 351 | + read_unlock(&journal->j_jmap_lock); 352 | + /* 353 | + * I'm not sure why this function was under the jmap_lock 354 | + * 
previously, but it can't be, since it calls functions that 355 | + * can block due to memory allocation. I don't think it needs 356 | + * to be protected, since it appears that ctx->mapping is only 357 | + * used by the cleaner code, and so it can't be run multiple 358 | + * times. -- TYT 359 | + */ 360 | + attach_live_blocks(ctx, handle, nr_live); 361 | + 362 | + err = jbd2_journal_stop(handle); 363 | + if (err) { 364 | + jbd2_journal_abort(journal, err); 365 | + return; 366 | + } 367 | + 368 | +done: 369 | + atomic_set(&ctx->batch_in_progress, 0); 370 | + atomic_inc(&ctx->nr_txns_cleaned); 371 | + if (wake_up_commit_thread) { 372 | + trace_jbd2_jmap_printf("waking up commit thread"); 373 | + wake_up(&journal->j_wait_commit); 374 | + } else { 375 | + trace_jbd2_jmap_printf("not waking up commit thread"); 376 | + spin_lock(&ctx->pos_lock); 377 | + ctx->pos = NULL; 378 | + spin_unlock(&ctx->pos_lock); 379 | + } 380 | + write_lock(&journal->j_state_lock); 381 | + journal->j_flags &= ~JBD2_CLEANING; 382 | + write_unlock(&journal->j_state_lock); 383 | +} 384 | + 385 | +/* 386 | + * Called by the commit thread to see if we need to do any cleaning 387 | + * work. 388 | + * Called with j_state_lock write locked. 389 | + */ 390 | +void jbd2_check_cleaner(journal_t *journal) 391 | +{ 392 | + /* 393 | + * If there is cleaning going on in the workqueue, don't check 394 | + * until we're done. 
395 | + */ 396 | + if (journal->j_flags & JBD2_CLEANING) 397 | + return; 398 | + 399 | + if (journal->j_flags & JBD2_STOP_CLEANING) { 400 | + disengage_cleaner: 401 | + journal->j_flags &= ~JBD2_CLEANER_ENGAGED; 402 | + return; 403 | + } 404 | + 405 | + if (journal->j_flags & JBD2_CLEANER_ENGAGED) { 406 | + if (jbd2_try_to_move_tail(journal) && 407 | + jbd2_high_on_space(journal)) 408 | + goto disengage_cleaner; 409 | + schedule_batch: 410 | + journal->j_flags |= JBD2_CLEANING; 411 | + schedule_work(&journal->j_cleaner_ctx->work); 412 | + return; 413 | + } 414 | + 415 | + if (jbd2_low_on_space(journal)) { 416 | + journal->j_flags |= JBD2_CLEANER_ENGAGED; 417 | + goto schedule_batch; 418 | + } 419 | +} 420 | diff --git a/fs/jbd2/jmap.c b/fs/jbd2/jmap.c 421 | index 7de6f4a0a1dc..0e759cc095f5 100644 422 | --- a/fs/jbd2/jmap.c 423 | +++ b/fs/jbd2/jmap.c 424 | @@ -91,8 +91,17 @@ static int process_existing_mappings(journal_t *journal, 425 | * We are either deleting the entry because it was revoked, or 426 | * we are moving it to the live blocks list of this transaction. 427 | * In either case, we remove it from its existing list. 428 | + * However, before removing it we check to see if this is an 429 | + * entry in the live blocks list of the tail transaction a 430 | + * pointer to whom is cached by the cleaner and update the 431 | + * cached pointer if so. 
432 | */ 433 | + spin_lock(&journal->j_cleaner_ctx->pos_lock); 434 | + if (je == journal->j_cleaner_ctx->pos) { 435 | + journal->j_cleaner_ctx->pos = list_next_entry(je, list); 436 | + } 437 | list_del(&je->list); 438 | + spin_unlock(&journal->j_cleaner_ctx->pos_lock); 439 | 440 | if (je->revoked) { 441 | rb_erase(&je->rb_node, &journal->j_jmap); 442 | @@ -216,6 +225,8 @@ void jbd2_finish_transaction_infos(journal_t *journal) 443 | { 444 | struct transaction_infos *tis = journal->j_transaction_infos; 445 | 446 | + atomic_inc(&journal->j_cleaner_ctx->nr_txns_committed); 447 | + 448 | write_lock(&journal->j_jmap_lock); 449 | tis->head = (tis->head + 1) & (MAX_LIVE_TRANSACTIONS - 1); 450 | write_unlock(&journal->j_jmap_lock); 451 | @@ -243,6 +254,8 @@ int jbd2_transaction_infos_add(journal_t *journal, transaction_t *transaction, 452 | */ 453 | BUG_ON(!list_empty(&ti->live_blks)); 454 | 455 | + atomic_inc(&journal->j_cleaner_ctx->nr_txns_committed); 456 | + 457 | write_lock(&journal->j_jmap_lock); 458 | nr_new = process_existing_mappings(journal, ti, t_idx, mappings, 459 | nr_mappings); 460 | @@ -489,11 +502,32 @@ int jbd2_smr_journal_init(journal_t *journal) 461 | { 462 | journal->j_jmap = RB_ROOT; 463 | rwlock_init(&journal->j_jmap_lock); 464 | + journal->j_cleaner_ctx = kzalloc(sizeof(struct cleaner_ctx), 465 | + GFP_KERNEL); 466 | + if (!journal->j_cleaner_ctx) 467 | + return -ENOMEM; 468 | + 469 | + journal->j_cleaner_ctx->journal = journal; 470 | + journal->j_cleaner_ctx->pos = NULL; 471 | + spin_lock_init(&journal->j_cleaner_ctx->pos_lock); 472 | + atomic_set(&journal->j_cleaner_ctx->cleaning, 0); 473 | + atomic_set(&journal->j_cleaner_ctx->batch_in_progress, 0); 474 | + atomic_set(&journal->j_cleaner_ctx->nr_pending_reads, 0); 475 | + atomic_set(&journal->j_cleaner_ctx->nr_txns_committed, 0); 476 | + atomic_set(&journal->j_cleaner_ctx->nr_txns_cleaned, 0); 477 | + init_waitqueue_head(&journal->j_cleaner_ctx->live_block_reads); 478 | + 
INIT_WORK(&journal->j_cleaner_ctx->work, jbd2_jmap_do_clean_batch); 479 | return jbd2_init_transaction_infos(journal); 480 | } 481 | 482 | void jbd2_smr_journal_exit(journal_t *journal) 483 | { 484 | + if (journal->j_cleaner_ctx) { 485 | + atomic_set(&journal->j_cleaner_ctx->cleaning, 0); 486 | + flush_work(&journal->j_cleaner_ctx->work); 487 | + kfree(journal->j_cleaner_ctx); 488 | + journal->j_cleaner_ctx = NULL; 489 | + } 490 | jbd2_free_transaction_infos(journal); 491 | } 492 | 493 | diff --git a/fs/jbd2/jmap.h b/fs/jbd2/jmap.h 494 | index 91564ce9bbda..a44f15152536 100644 495 | --- a/fs/jbd2/jmap.h 496 | +++ b/fs/jbd2/jmap.h 497 | @@ -125,4 +125,81 @@ extern void jbd2_jmap_cancel_revoke(journal_t *journal, sector_t fsblk); 498 | extern int read_block_from_log(journal_t *journal, struct buffer_head *bh, 499 | int op_flags, sector_t blk); 500 | 501 | +extern void jbd2_jmap_do_clean_batch(struct work_struct *work); 502 | + 503 | +/* 504 | + * Cleaner stuff is below. 505 | + */ 506 | + 507 | +/* 508 | + * Number of blocks to read at once, for cleaning. 509 | + */ 510 | +#define CLEANER_BATCH_SIZE 16 511 | + 512 | +/* 513 | + * Context structure for the cleaner. 514 | + */ 515 | +struct cleaner_ctx { 516 | + /* 517 | + * We set to true once we drop below low watermark and it stays so until 518 | + * we rise above the high watermark. It is accessed by the commit 519 | + * thread and the foreground kernel threads during the journal 520 | + * destruction, therefore it is atomic. 521 | + */ 522 | + atomic_t cleaning; 523 | + 524 | + /* 525 | + * We clean in batches of blocks. This flag indicates if we are 526 | + * currently cleaning a batch. It is accessed by the commit thread and 527 | + * the cleaner thread, therefore it is atomic. 528 | + */ 529 | + atomic_t batch_in_progress; 530 | + 531 | + /* 532 | + * We find live blocks to clean from the live blocks list of the 533 | + * transaction at the tail. 
This list can be larger than our batch size 534 | + * and we may need several attempts to process it. We cache the 535 | + * position of the next entry to start from in |pos|. Since cleaner 536 | + * thread can run concurrently with the commit thread that can modify 537 | + * the live blocks list of the transaction at the tail (for example, if 538 | + * it needs to drop a revoked entry or if |pos| points to an entry that 539 | + * has been updated and should move from the live blocks list of the 540 | + * transaction at the tail to the live blocks list of current 541 | + * transaction) we protect |pos| with |pos_lock|. 542 | + */ 543 | + struct jmap_entry *pos; 544 | + spinlock_t pos_lock; 545 | + 546 | + /* 547 | + * Live block mappings for the blocks that we copy in a batch. 548 | + */ 549 | + struct blk_mapping mappings[CLEANER_BATCH_SIZE]; 550 | + 551 | + /* 552 | + * Buffer heads for the live blocks read in a batch. 553 | + */ 554 | + struct buffer_head *bhs[CLEANER_BATCH_SIZE]; 555 | + 556 | + /* 557 | + * Number of pending reads in a batch. Every submitted read increments 558 | + * it and every completed read decrements it. 559 | + */ 560 | + atomic_t nr_pending_reads; 561 | + 562 | + /* 563 | + * The cleaner thread sleeps on this wait queue until the last 564 | + * completed read wakes up the cleaner thread. 565 | + */ 566 | + wait_queue_head_t live_block_reads; 567 | + 568 | + /* TODO: temporary for debugging, remove once done. 
*/ 569 | + atomic_t nr_txns_committed; 570 | + atomic_t nr_txns_cleaned; 571 | + 572 | + journal_t *journal; 573 | + struct work_struct work; 574 | +}; 575 | + 576 | +void jbd2_check_cleaner(journal_t *journal); 577 | + 578 | #endif 579 | diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c 580 | index 114c7636d706..5fdcaff927cf 100644 581 | --- a/fs/jbd2/journal.c 582 | +++ b/fs/jbd2/journal.c 583 | @@ -230,10 +230,16 @@ static int kjournald2(void *arg) 584 | del_timer_sync(&journal->j_commit_timer); 585 | jbd2_journal_commit_transaction(journal); 586 | write_lock(&journal->j_state_lock); 587 | - goto loop; 588 | } 589 | 590 | wake_up(&journal->j_wait_done_commit); 591 | + 592 | + if (journal->j_flags & JBD2_LAZY) 593 | + jbd2_check_cleaner(journal); 594 | + 595 | + if (journal->j_commit_sequence != journal->j_commit_request) 596 | + goto loop; 597 | + 598 | if (freezing(current)) { 599 | /* 600 | * The simpler the better. Flushing journal isn't a 601 | @@ -262,6 +268,9 @@ static int kjournald2(void *arg) 602 | should_sleep = 0; 603 | if (journal->j_flags & JBD2_UNMOUNT) 604 | should_sleep = 0; 605 | + if ((journal->j_flags & JBD2_CLEANER_ENGAGED) && 606 | + !(journal->j_flags & JBD2_CLEANING)) 607 | + should_sleep = 0; 608 | if (should_sleep) { 609 | write_unlock(&journal->j_state_lock); 610 | schedule(); 611 | @@ -307,14 +316,24 @@ static int jbd2_journal_start_thread(journal_t *journal) 612 | static void journal_kill_thread(journal_t *journal) 613 | { 614 | write_lock(&journal->j_state_lock); 615 | - journal->j_flags |= JBD2_UNMOUNT; 616 | 617 | + journal->j_flags |= JBD2_STOP_CLEANING; 618 | + while (journal->j_flags & JBD2_CLEANING) { 619 | + write_unlock(&journal->j_state_lock); 620 | + wake_up(&journal->j_wait_commit); 621 | + wait_event(journal->j_wait_done_commit, 622 | + (journal->j_flags & JBD2_CLEANING) == 0); 623 | + write_lock(&journal->j_state_lock); 624 | + } 625 | + 626 | + journal->j_flags |= JBD2_UNMOUNT; 627 | while (journal->j_task) { 628 | 
write_unlock(&journal->j_state_lock); 629 | wake_up(&journal->j_wait_commit); 630 | wait_event(journal->j_wait_done_commit, journal->j_task == NULL); 631 | write_lock(&journal->j_state_lock); 632 | } 633 | + 634 | write_unlock(&journal->j_state_lock); 635 | } 636 | 637 | diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h 638 | index a53c7d333199..bb994983cdba 100644 639 | --- a/include/linux/jbd2.h 640 | +++ b/include/linux/jbd2.h 641 | @@ -786,6 +786,11 @@ struct journal_s 642 | */ 643 | rwlock_t j_jmap_lock; 644 | 645 | + /** 646 | + * @j_cleaner_ctx: Cleaner state 647 | + */ 648 | + struct cleaner_ctx *j_cleaner_ctx; 649 | + 650 | /** 651 | * @j_format_version: Version of the superblock format. 652 | */ 653 | @@ -1254,6 +1259,9 @@ JBD2_FEATURE_INCOMPAT_FUNCS(csum3, CSUM_V3) 654 | #define JBD2_REC_ERR 0x080 /* The errno in the sb has been recorded */ 655 | #define JBD2_NO_CLEANUP 0x100 /* Don't flush empty the journal on shutdown */ 656 | #define JBD2_LAZY 0x200 /* Do lazy journalling */ 657 | +#define JBD2_CLEANING 0x400 /* Lazy journalling cleaning in progress */ 658 | +#define JBD2_CLEANER_ENGAGED 0x800 /* Cleaner has been engaged */ 659 | +#define JBD2_STOP_CLEANING 0x1000 /* Request the cleaning thread to stop */ 660 | 661 | /* 662 | * Function declarations for the journaling transaction and buffer 663 | -------------------------------------------------------------------------------- /disable-writeback: -------------------------------------------------------------------------------- 1 | Disable writeback 2 | 3 | From: Abutalib Aghayev 4 | 5 | Now that we have a working cleaner, disable writeback of metadata blocks. 
6 | 7 | Signed-off-by: Abutalib Aghayev 8 | Signed-off-by: Theodore Ts'o 9 | --- 10 | fs/jbd2/transaction.c | 5 ++++- 11 | include/linux/journal-head.h | 5 +++++ 12 | 2 files changed, 9 insertions(+), 1 deletion(-) 13 | 14 | diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c 15 | index 5e659ee08d6a..1bd1a1530fbc 100644 16 | --- a/fs/jbd2/transaction.c 17 | +++ b/fs/jbd2/transaction.c 18 | @@ -894,6 +894,8 @@ do_get_write_access(handle_t *handle, struct journal_head *jh, 19 | jh->b_next_transaction == transaction) 20 | goto done; 21 | 22 | + jh->b_jflags = journal->j_flags; 23 | + 24 | /* 25 | * this is the first time this transaction is touching this buffer, 26 | * reset the modified flag 27 | @@ -1863,7 +1865,8 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) 28 | 29 | __blist_del_buffer(list, jh); 30 | jh->b_jlist = BJ_None; 31 | - if (transaction && is_journal_aborted(transaction->t_journal)) 32 | + if ((transaction && is_journal_aborted(transaction->t_journal)) || 33 | + (jh->b_jflags & JBD2_LAZY)) 34 | clear_buffer_jbddirty(bh); 35 | else if (test_clear_buffer_jbddirty(bh)) 36 | mark_buffer_dirty(bh); /* Expose it to the VM */ 37 | diff --git a/include/linux/journal-head.h b/include/linux/journal-head.h 38 | index 98cd41bb39c8..d4cce2bab7ff 100644 39 | --- a/include/linux/journal-head.h 40 | +++ b/include/linux/journal-head.h 41 | @@ -58,6 +58,11 @@ struct journal_head { 42 | char *b_committed_data; 43 | 44 | /* 45 | + * Copy of journal->j_flags 46 | + */ 47 | + unsigned b_jflags; 48 | + 49 | + /* 50 | * Pointer to the compound transaction which owns this buffer's 51 | * metadata: either the running transaction or the committing 52 | * transaction (if there is one). 
Only applies to buffers on a 53 | -------------------------------------------------------------------------------- /jbd2-dont-double-bump-transaction-number: -------------------------------------------------------------------------------- 1 | jbd2: don't skip a transaction number when recovering journal 2 | 3 | In the lazy journalling patches we retain the journal, so skipping a 4 | transaction after the replay is problematic. 5 | 6 | Signed-off-by: Theodore Ts'o 7 | --- 8 | fs/jbd2/recovery.c | 2 +- 9 | 1 file changed, 1 insertion(+), 1 deletion(-) 10 | 11 | diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c 12 | index 02dd3360cb20..da100044566c 100644 13 | --- a/fs/jbd2/recovery.c 14 | +++ b/fs/jbd2/recovery.c 15 | @@ -281,7 +281,7 @@ int jbd2_journal_recover(journal_t *journal) 16 | 17 | /* Restart the log at the next transaction ID, thus invalidating 18 | * any existing commit records in the log. */ 19 | - journal->j_transaction_sequence = ++info.end_transaction; 20 | + journal->j_transaction_sequence = info.end_transaction; 21 | 22 | jbd2_journal_clear_revoke(journal); 23 | err2 = sync_blockdev(journal->j_fs_dev); 24 | -------------------------------------------------------------------------------- /journal-superblock-changes: -------------------------------------------------------------------------------- 1 | ext4: journal superblock changes 2 | 3 | There are a number of changes to the ext4 superblock during the mount 4 | process which are done without using the journal, but instead via the 5 | brute-force call to ext4_commit_super(). Concentrate these changes to 6 | ext4_setup_super(), and make them using the journalling mechanism. 7 | 8 | Not only is this more efficient, but it also avoids some cases where 9 | the ext4 superblock's checksum was not properly set. 
10 | 11 | Signed-off-by: Theodore Ts'o 12 | --- 13 | fs/ext4/super.c | 50 ++++++++++++++++++++++++++++---------------------- 14 | 1 file changed, 28 insertions(+), 22 deletions(-) 15 | 16 | diff --git a/fs/ext4/super.c b/fs/ext4/super.c 17 | index 680526e9ee96..ae86983cbf60 100644 18 | --- a/fs/ext4/super.c 19 | +++ b/fs/ext4/super.c 20 | @@ -2148,9 +2148,10 @@ int ext4_seq_options_show(struct seq_file *seq, void *offset) 21 | } 22 | 23 | static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, 24 | - int read_only) 25 | + unsigned long journal_devnum, int read_only) 26 | { 27 | struct ext4_sb_info *sbi = EXT4_SB(sb); 28 | + handle_t *handle; 29 | int err = 0; 30 | 31 | if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) { 32 | @@ -2158,7 +2159,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, 33 | "forcing read-only mode"); 34 | err = -EROFS; 35 | } 36 | - if (read_only) 37 | + if (read_only || err) 38 | goto done; 39 | if (!(sbi->s_mount_state & EXT4_VALID_FS)) 40 | ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, " 41 | @@ -2179,6 +2180,15 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, 42 | ext4_msg(sb, KERN_WARNING, 43 | "warning: checktime reached, " 44 | "running e2fsck is recommended"); 45 | + handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); 46 | + if (IS_ERR(handle)) 47 | + return PTR_ERR(handle); 48 | + err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); 49 | + if (err) { 50 | + stop_journal: 51 | + ext4_journal_stop(handle); 52 | + return err; 53 | + } 54 | if (!sbi->s_journal) 55 | es->s_state &= cpu_to_le16(~EXT4_VALID_FS); 56 | if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) 57 | @@ -2188,7 +2198,17 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, 58 | ext4_update_dynamic_rev(sb); 59 | if (sbi->s_journal) 60 | ext4_set_feature_journal_needs_recovery(sb); 61 | - 62 | + if (journal_devnum) 63 | + 
es->s_journal_dev = cpu_to_le32(journal_devnum); 64 | + if (DUMMY_ENCRYPTION_ENABLED(sbi)) 65 | + ext4_set_feature_encrypt(sb); 66 | + err = ext4_handle_dirty_super(handle, sb); 67 | + if (err) 68 | + goto stop_journal; 69 | + err = ext4_journal_stop(handle); 70 | + if (err) 71 | + return err; 72 | + ext4_journal_force_commit(sbi->s_journal); 73 | err = ext4_commit_super(sb, 1); 74 | done: 75 | if (test_opt(sb, DEBUG)) 76 | @@ -4229,8 +4249,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) 77 | 78 | set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); 79 | 80 | - sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; 81 | - 82 | no_journal: 83 | if (!test_opt(sb, NO_MBCACHE)) { 84 | sbi->s_ea_block_cache = ext4_xattr_create_cache(); 85 | @@ -4257,12 +4275,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) 86 | goto failed_mount_wq; 87 | } 88 | 89 | - if (DUMMY_ENCRYPTION_ENABLED(sbi) && !sb_rdonly(sb) && 90 | - !ext4_has_feature_encrypt(sb)) { 91 | - ext4_set_feature_encrypt(sb); 92 | - ext4_commit_super(sb, 1); 93 | - } 94 | - 95 | /* 96 | * Get the # of file system overhead blocks from the 97 | * superblock if present. 
98 | @@ -4311,7 +4323,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) 99 | goto failed_mount4; 100 | } 101 | 102 | - ret = ext4_setup_super(sb, es, sb_rdonly(sb)); 103 | + ret = ext4_setup_super(sb, es, journal_devnum, sb_rdonly(sb)); 104 | if (ret == -EROFS) { 105 | sb->s_flags |= SB_RDONLY; 106 | ret = 0; 107 | @@ -4410,6 +4422,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) 108 | } 109 | #endif /* CONFIG_QUOTA */ 110 | 111 | + if (sbi->s_journal) 112 | + sbi->s_journal->j_commit_callback = 113 | + ext4_journal_commit_callback; 114 | EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; 115 | ext4_orphan_cleanup(sb, es); 116 | EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; 117 | @@ -4780,15 +4795,6 @@ static int ext4_load_journal(struct super_block *sb, 118 | 119 | EXT4_SB(sb)->s_journal = journal; 120 | ext4_clear_journal_err(sb, es); 121 | - 122 | - if (!really_read_only && journal_devnum && 123 | - journal_devnum != le32_to_cpu(es->s_journal_dev)) { 124 | - es->s_journal_dev = cpu_to_le32(journal_devnum); 125 | - 126 | - /* Make sure we flush the recovery flag to disk. */ 127 | - ext4_commit_super(sb, 1); 128 | - } 129 | - 130 | return 0; 131 | } 132 | 133 | @@ -5263,7 +5269,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) 134 | ext4_clear_journal_err(sb, es); 135 | sbi->s_mount_state = le16_to_cpu(es->s_state); 136 | 137 | - err = ext4_setup_super(sb, es, 0); 138 | + err = ext4_setup_super(sb, es, 0, 0); 139 | if (err) 140 | goto restore_opts; 141 | 142 | -------------------------------------------------------------------------------- /load-jmap-from-journal: -------------------------------------------------------------------------------- 1 | jbd2: load jmap from journal 2 | 3 | If the lazy journal feature is enabled, instead of replaying the 4 | journal, read the journal into journal map. 
5 | 6 | Signed-off-by: Theodore Ts'o 7 | --- 8 | fs/jbd2/journal.c | 27 +++++++++--------------- 9 | fs/jbd2/recovery.c | 105 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------- 10 | 2 files changed, 90 insertions(+), 42 deletions(-) 11 | 12 | diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c 13 | index 9c097ddfe63f..8060ab4805eb 100644 14 | --- a/fs/jbd2/journal.c 15 | +++ b/fs/jbd2/journal.c 16 | @@ -1276,31 +1276,24 @@ static void journal_fail_superblock (journal_t *journal) 17 | 18 | /* 19 | * Given a journal_t structure, initialise the various fields for 20 | - * startup of a new journaling session. We use this both when creating 21 | - * a journal, and after recovering an old journal to reset it for 22 | - * subsequent use. 23 | + * startup of a new journaling session. 24 | */ 25 | - 26 | static int journal_reset(journal_t *journal) 27 | { 28 | journal_superblock_t *sb = journal->j_superblock; 29 | - unsigned long long first, last; 30 | + int free; 31 | 32 | - first = be32_to_cpu(sb->s_first); 33 | - last = be32_to_cpu(sb->s_maxlen); 34 | - if (first + JBD2_MIN_JOURNAL_BLOCKS > last + 1) { 35 | - printk(KERN_ERR "JBD2: Journal too short (blocks %llu-%llu).\n", 36 | - first, last); 37 | + if (journal->j_first + JBD2_MIN_JOURNAL_BLOCKS > journal->j_last + 1) { 38 | + printk(KERN_ERR "JBD2: Journal too short (blocks %lu-%lu).\n", 39 | + journal->j_first, journal->j_last); 40 | journal_fail_superblock(journal); 41 | return -EINVAL; 42 | } 43 | 44 | - journal->j_first = first; 45 | - journal->j_last = last; 46 | - 47 | - journal->j_head = first; 48 | - journal->j_tail = first; 49 | - journal->j_free = last - first; 50 | + free = journal->j_tail - journal->j_head; 51 | + if (free <= 0) 52 | + free += journal->j_last - journal->j_first; 53 | + journal->j_free = free; 54 | 55 | journal->j_tail_sequence = journal->j_transaction_sequence; 56 | journal->j_commit_sequence = journal->j_transaction_sequence - 1; 57 | @@ -1320,7 
+1313,7 @@ static int journal_reset(journal_t *journal) 58 | journal->j_tail, journal->j_tail_sequence, 59 | journal->j_errno); 60 | journal->j_flags |= JBD2_FLUSHED; 61 | - } else { 62 | + } else if ((journal->j_flags & JBD2_LAZY) == 0) { 63 | /* Lock here to make assertions happy... */ 64 | mutex_lock_io(&journal->j_checkpoint_mutex); 65 | /* 66 | diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c 67 | index da100044566c..7a74ea1860a9 100644 68 | --- a/fs/jbd2/recovery.c 69 | +++ b/fs/jbd2/recovery.c 70 | @@ -22,6 +22,7 @@ 71 | #include 72 | #include 73 | #include 74 | +#include "jmap.h" 75 | #endif 76 | 77 | /* 78 | @@ -32,17 +33,18 @@ struct recovery_info 79 | { 80 | tid_t start_transaction; 81 | tid_t end_transaction; 82 | + int head_block; 83 | 84 | int nr_replays; 85 | int nr_revokes; 86 | int nr_revoke_hits; 87 | }; 88 | 89 | -enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY}; 90 | +enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY, PASS_JMAP}; 91 | static int do_one_pass(journal_t *journal, 92 | struct recovery_info *info, enum passtype pass); 93 | -static int scan_revoke_records(journal_t *, struct buffer_head *, 94 | - tid_t, struct recovery_info *); 95 | +static int scan_revoke_records(journal_t *, struct buffer_head *, enum passtype, 96 | + tid_t, struct recovery_info *); 97 | 98 | #ifdef __KERNEL__ 99 | 100 | @@ -255,11 +257,16 @@ int jbd2_journal_recover(journal_t *journal) 101 | sb = journal->j_superblock; 102 | 103 | /* 104 | + * Initialize journal's head and tail assuming the recovery 105 | + * was successful and we're not doing lazy journalling. 106 | + */ 107 | + journal->j_head = journal->j_tail = journal->j_first; 108 | + 109 | + /* 110 | * The journal superblock's s_start field (the current log head) 111 | * is always zero if, and only if, the journal was cleanly 112 | * unmounted. 
113 | */ 114 | - 115 | if (!sb->s_start) { 116 | jbd_debug(1, "No recovery required, last transaction %d\n", 117 | be32_to_cpu(sb->s_sequence)); 118 | @@ -267,11 +274,15 @@ int jbd2_journal_recover(journal_t *journal) 119 | return 0; 120 | } 121 | 122 | - err = do_one_pass(journal, &info, PASS_SCAN); 123 | - if (!err) 124 | - err = do_one_pass(journal, &info, PASS_REVOKE); 125 | - if (!err) 126 | - err = do_one_pass(journal, &info, PASS_REPLAY); 127 | + if (journal->j_flags & JBD2_LAZY) 128 | + err = do_one_pass(journal, &info, PASS_JMAP); 129 | + else { 130 | + err = do_one_pass(journal, &info, PASS_SCAN); 131 | + if (!err) 132 | + err = do_one_pass(journal, &info, PASS_REVOKE); 133 | + if (!err) 134 | + err = do_one_pass(journal, &info, PASS_REPLAY); 135 | + } 136 | 137 | jbd_debug(1, "JBD2: recovery, exit status %d, " 138 | "recovered transactions %u to %u\n", 139 | @@ -279,10 +290,22 @@ int jbd2_journal_recover(journal_t *journal) 140 | jbd_debug(1, "JBD2: Replayed %d and revoked %d/%d blocks\n", 141 | info.nr_replays, info.nr_revoke_hits, info.nr_revokes); 142 | 143 | - /* Restart the log at the next transaction ID, thus invalidating 144 | - * any existing commit records in the log. */ 145 | + /* Restart the log at the next transaction ID */ 146 | journal->j_transaction_sequence = info.end_transaction; 147 | 148 | + /* 149 | + * In lazy journalling mode, we need to preserve the existing 150 | + * contents of the journal, so set j_head and j_tail 151 | + * accordingly. 
152 | + */ 153 | + if (journal->j_flags & JBD2_LAZY) { 154 | + if (err) 155 | + return err; 156 | + journal->j_head = info.head_block; 157 | + journal->j_tail = be32_to_cpu(sb->s_start); 158 | + return 0; 159 | + } 160 | + 161 | jbd2_journal_clear_revoke(journal); 162 | err2 = sync_blockdev(journal->j_fs_dev); 163 | if (!err) 164 | @@ -431,6 +454,7 @@ static int do_one_pass(journal_t *journal, 165 | __u32 crc32_sum = ~0; /* Transactional Checksums */ 166 | int descr_csum_size = 0; 167 | int block_error = 0; 168 | + int new_txn = 1; 169 | 170 | /* 171 | * First thing is to establish what we expect to find in the log 172 | @@ -443,7 +467,7 @@ static int do_one_pass(journal_t *journal, 173 | next_log_block = be32_to_cpu(sb->s_start); 174 | 175 | first_commit_ID = next_commit_ID; 176 | - if (pass == PASS_SCAN) 177 | + if (pass == PASS_SCAN || pass == PASS_JMAP) 178 | info->start_transaction = first_commit_ID; 179 | 180 | jbd_debug(1, "Starting recovery pass %d\n", pass); 181 | @@ -468,7 +492,7 @@ static int do_one_pass(journal_t *journal, 182 | * check right now that we haven't gone past the end of 183 | * the log. */ 184 | 185 | - if (pass != PASS_SCAN) 186 | + if (pass != PASS_SCAN && pass != PASS_JMAP) 187 | if (tid_geq(next_commit_ID, info->end_transaction)) 188 | break; 189 | 190 | @@ -484,9 +508,6 @@ static int do_one_pass(journal_t *journal, 191 | if (err) 192 | goto failed; 193 | 194 | - next_log_block++; 195 | - wrap(journal, next_log_block); 196 | - 197 | /* What kind of buffer is it? 
198 | * 199 | * If it is a descriptor block, check that it has the 200 | @@ -510,6 +531,14 @@ static int do_one_pass(journal_t *journal, 201 | break; 202 | } 203 | 204 | + if ((pass == PASS_JMAP) && new_txn) { 205 | + jbd2_add_new_transaction_infos(journal, sequence, next_log_block); 206 | + new_txn = 0; 207 | + } 208 | + 209 | + next_log_block++; 210 | + wrap(journal, next_log_block); 211 | + 212 | /* OK, we have a valid descriptor block which matches 213 | * all of the sequence number checks. What are we going 214 | * to do with it? That depends on the pass... */ 215 | @@ -535,7 +564,7 @@ static int do_one_pass(journal_t *journal, 216 | * in pass REPLAY; if journal_checksums enabled, then 217 | * calculate checksums in PASS_SCAN, otherwise, 218 | * just skip over the blocks it describes. */ 219 | - if (pass != PASS_REPLAY) { 220 | + if ((pass != PASS_REPLAY) && (pass != PASS_JMAP)) { 221 | if (pass == PASS_SCAN && 222 | jbd2_has_feature_checksum(journal) && 223 | !info->end_transaction) { 224 | @@ -562,12 +591,28 @@ static int do_one_pass(journal_t *journal, 225 | while ((tagp - bh->b_data + tag_bytes) 226 | <= journal->j_blocksize - descr_csum_size) { 227 | unsigned long io_block; 228 | + unsigned long long log_block; 229 | 230 | tag = (journal_block_tag_t *) tagp; 231 | flags = be16_to_cpu(tag->t_flags); 232 | 233 | io_block = next_log_block++; 234 | wrap(journal, next_log_block); 235 | + if (pass == PASS_JMAP) { 236 | + struct blk_mapping map; 237 | + 238 | + err = jbd2_journal_bmap(journal, 239 | + io_block, 240 | + &log_block); 241 | + if (err) 242 | + goto failed; 243 | + map.fsblk = read_tag_block(journal, tag); 244 | + map.logblk = log_block; 245 | + err = jbd2_add_mapping(journal, &map); 246 | + if (err) 247 | + goto failed; 248 | + goto skip_write; 249 | + } 250 | err = jread(&obh, journal, io_block); 251 | if (err) { 252 | /* Recover what we can, but 253 | @@ -753,6 +798,10 @@ static int do_one_pass(journal_t *journal, 254 | break; 255 | } 256 | } 257 
| + if (pass == PASS_JMAP) { 258 | + jbd2_finish_transaction_infos(journal); 259 | + new_txn = 1; 260 | + } 261 | brelse(bh); 262 | next_commit_ID++; 263 | continue; 264 | @@ -760,12 +809,12 @@ static int do_one_pass(journal_t *journal, 265 | case JBD2_REVOKE_BLOCK: 266 | /* If we aren't in the REVOKE pass, then we can 267 | * just skip over this block. */ 268 | - if (pass != PASS_REVOKE) { 269 | + if (pass != PASS_REVOKE && pass != PASS_JMAP) { 270 | brelse(bh); 271 | continue; 272 | } 273 | 274 | - err = scan_revoke_records(journal, bh, 275 | + err = scan_revoke_records(journal, bh, pass, 276 | next_commit_ID, info); 277 | brelse(bh); 278 | if (err) 279 | @@ -788,9 +837,10 @@ static int do_one_pass(journal_t *journal, 280 | * transaction marks the end of the valid log. 281 | */ 282 | 283 | - if (pass == PASS_SCAN) { 284 | + if (pass == PASS_SCAN || pass == PASS_JMAP) { 285 | if (!info->end_transaction) 286 | info->end_transaction = next_commit_ID; 287 | + info->head_block = next_log_block; 288 | } else { 289 | /* It's really bad news if different passes end up at 290 | * different places (but possible due to IO errors). */ 291 | @@ -813,7 +863,8 @@ static int do_one_pass(journal_t *journal, 292 | /* Scan a revoke record, marking all blocks mentioned as revoked. 
*/ 293 | 294 | static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, 295 | - tid_t sequence, struct recovery_info *info) 296 | + enum passtype pass, tid_t sequence, 297 | + struct recovery_info *info) 298 | { 299 | jbd2_journal_revoke_header_t *header; 300 | int offset, max; 301 | @@ -839,16 +890,20 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, 302 | 303 | while (offset + record_len <= max) { 304 | unsigned long long blocknr; 305 | - int err; 306 | 307 | if (record_len == 4) 308 | blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset))); 309 | else 310 | blocknr = be64_to_cpu(* ((__be64 *) (bh->b_data+offset))); 311 | offset += record_len; 312 | - err = jbd2_journal_set_revoke(journal, blocknr, sequence); 313 | - if (err) 314 | - return err; 315 | + if (pass == PASS_JMAP) 316 | + jbd2_jmap_revoke(journal, blocknr); 317 | + else { 318 | + int err = jbd2_journal_set_revoke(journal, blocknr, 319 | + sequence); 320 | + if (err) 321 | + return err; 322 | + } 323 | ++info->nr_revokes; 324 | } 325 | return 0; 326 | -------------------------------------------------------------------------------- /old-patches/add-blkdiscard-ioctl: -------------------------------------------------------------------------------- 1 | ext4: add BLKDISCARD ioctl 2 | 3 | The blkdiscard ioctl previously only worked on block devices. Allow 4 | this ioctl to work on ext4 files. 
5 | 6 | Google-Bug-Id: 11517631 7 | 8 | Signed-off-by: "Theodore Ts'o" 9 | --- 10 | fs/ext4/ext4.h | 5 ++++ 11 | fs/ext4/extents.c | 38 +++++++++++++++++++------ 12 | fs/ext4/ioctl.c | 138 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 13 | 3 files changed, 172 insertions(+), 9 deletions(-) 14 | 15 | diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h 16 | index 122cc74..68d88c7 100644 17 | --- a/fs/ext4/ext4.h 18 | +++ b/fs/ext4/ext4.h 19 | @@ -3208,6 +3208,8 @@ extern int ext4_check_blockref(const char *, unsigned int, 20 | /* extents.c */ 21 | struct ext4_ext_path; 22 | struct ext4_extent; 23 | +typedef int (*extent_iterator_t)(struct inode *inode, struct extent_status *es, 24 | + unsigned int flags, void *private); 25 | 26 | /* 27 | * Maximum number of logical blocks in a file; ext4_extent's ee_block is 28 | @@ -3252,6 +3254,9 @@ extern int ext4_find_delalloc_range(struct inode *inode, 29 | ext4_lblk_t lblk_end); 30 | extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk); 31 | extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path); 32 | +extern int ext4_extent_iterator(struct inode *inode, 33 | + ext4_lblk_t block, ext4_lblk_t num, 34 | + extent_iterator_t callback, void *private); 35 | extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 36 | __u64 start, __u64 len); 37 | extern int ext4_ext_precache(struct inode *inode); 38 | diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c 39 | index 572fb4346..c32f0c1 100644 40 | --- a/fs/ext4/extents.c 41 | +++ b/fs/ext4/extents.c 42 | @@ -2150,9 +2150,13 @@ cleanup: 43 | return err; 44 | } 45 | 46 | -static int ext4_fill_fiemap_extents(struct inode *inode, 47 | - ext4_lblk_t block, ext4_lblk_t num, 48 | - struct fiemap_extent_info *fieinfo) 49 | + 50 | +typedef int (*extent_iterator_t)(struct inode *inode, struct extent_status *es, 51 | + unsigned int flags, void *private); 52 | + 53 | +int 
ext4_extent_iterator(struct inode *inode, 54 | + ext4_lblk_t block, ext4_lblk_t num, 55 | + extent_iterator_t callback, void *private) 56 | { 57 | struct ext4_ext_path *path = NULL; 58 | struct ext4_extent *ex; 59 | @@ -2161,7 +2165,6 @@ static int ext4_fill_fiemap_extents(struct inode *inode, 60 | ext4_lblk_t last = block + num; 61 | int exists, depth = 0, err = 0; 62 | unsigned int flags = 0; 63 | - unsigned char blksize_bits = inode->i_sb->s_blocksize_bits; 64 | 65 | while (block < last && block != EXT_MAX_BLOCKS) { 66 | num = last - block; 67 | @@ -2278,11 +2281,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode, 68 | } 69 | 70 | if (exists) { 71 | - err = fiemap_fill_next_extent(fieinfo, 72 | - (__u64)es.es_lblk << blksize_bits, 73 | - (__u64)es.es_pblk << blksize_bits, 74 | - (__u64)es.es_len << blksize_bits, 75 | - flags); 76 | + err = callback(inode, &es, flags, private); 77 | if (err < 0) 78 | break; 79 | if (err == 1) { 80 | @@ -2341,6 +2340,27 @@ static ext4_lblk_t ext4_ext_determine_hole(struct inode *inode, 81 | return len; 82 | } 83 | 84 | +static int call_fill_fiemap(struct inode *inode, struct extent_status *es, 85 | + unsigned int flags, void *private) 86 | +{ 87 | + unsigned char blksize_bits = inode->i_sb->s_blocksize_bits; 88 | + 89 | + return fiemap_fill_next_extent(private, 90 | + (__u64)es->es_lblk << blksize_bits, 91 | + (__u64)es->es_pblk << blksize_bits, 92 | + (__u64)es->es_len << blksize_bits, 93 | + flags); 94 | +} 95 | + 96 | +static int ext4_fill_fiemap_extents(struct inode *inode, 97 | + ext4_lblk_t block, ext4_lblk_t num, 98 | + struct fiemap_extent_info *fieinfo) 99 | +{ 100 | + return ext4_extent_iterator(inode, block, num, 101 | + call_fill_fiemap, fieinfo); 102 | +} 103 | + 104 | + 105 | /* 106 | * ext4_ext_put_gap_in_cache: 107 | * calculate boundaries of the gap that the requested block fits into 108 | diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c 109 | index 6c6be1d..fa8bac0 100644 110 | --- a/fs/ext4/ioctl.c 111 | 
+++ b/fs/ext4/ioctl.c 112 | @@ -468,6 +468,132 @@ static int write_user_mdata(unsigned long arg, 113 | } 114 | #endif 115 | 116 | +static int discard_callback(struct inode *inode, struct extent_status *es, 117 | + unsigned int flags, void *private) 118 | +{ 119 | + struct ext4_map_blocks *map = private; 120 | + ext4_lblk_t es_lblk = es->es_lblk; 121 | + ext4_lblk_t es_len = es->es_len; 122 | + ext4_fsblk_t es_pblk = es->es_pblk; 123 | + 124 | + if (flags & (FIEMAP_EXTENT_UNKNOWN | 125 | + FIEMAP_EXTENT_ENCODED | 126 | + FIEMAP_EXTENT_DATA_ENCRYPTED | 127 | + FIEMAP_EXTENT_DELALLOC | 128 | + FIEMAP_EXTENT_DATA_TAIL | 129 | + FIEMAP_EXTENT_DATA_INLINE | 130 | + FIEMAP_EXTENT_NOT_ALIGNED | 131 | + FIEMAP_EXTENT_SHARED)) 132 | + return 0; 133 | + 134 | + if (es_lblk < map->m_lblk) { 135 | + ext4_lblk_t d = map->m_lblk - es_lblk; 136 | + if (d > es_len) 137 | + return 0; 138 | + es_lblk += d; 139 | + es_pblk += d; 140 | + es_len -= d; 141 | + } 142 | + 143 | + if (es_lblk + es_len > map->m_lblk + map->m_len) 144 | + es_len -= es_lblk + es_len - (map->m_lblk + map->m_len); 145 | +#ifdef BLKDISCARD_DEBUG 146 | + ext4_msg(inode->i_sb, KERN_NOTICE, "discard: %llu len %lu", 147 | + (unsigned long long) es_pblk, (unsigned long) es_len); 148 | + return 0; 149 | +#else 150 | + return sb_issue_discard(inode->i_sb, es_pblk, es_len, GFP_KERNEL, 0); 151 | +#endif 152 | +} 153 | + 154 | +static int blkdiscard_inode(struct inode *inode, u64 start_offset, u64 len) 155 | +{ 156 | + struct super_block *sb = inode->i_sb; 157 | + struct ext4_map_blocks map; 158 | + unsigned int num; 159 | + 160 | + if (!S_ISREG(inode->i_mode)) 161 | + return -EINVAL; 162 | + 163 | + if (!blk_queue_discard(bdev_get_queue(sb->s_bdev))) 164 | + return -EOPNOTSUPP; 165 | + 166 | + if (!bdev_discard_zeroes_data(sb->s_bdev) && !capable(CAP_SYS_ADMIN)) 167 | + return -EOPNOTSUPP; 168 | + 169 | + num = start_offset & (sb->s_blocksize - 1); 170 | + if (num) { 171 | + num = sb->s_blocksize - num; 172 | + 
start_offset += num; 173 | + len = (len > num) ? len - num : 0; 174 | + } 175 | + if (len == 0) 176 | + return 0; 177 | + if (start_offset > sb->s_maxbytes) 178 | + return -EFBIG; 179 | + if (len > sb->s_maxbytes || (sb->s_maxbytes - len) < start_offset) 180 | + len = sb->s_maxbytes - start_offset; 181 | + 182 | + map.m_lblk = start_offset >> sb->s_blocksize_bits; 183 | + map.m_len = len >> sb->s_blocksize_bits; 184 | + 185 | +#ifdef BLKDISCARD_DEBUG 186 | + ext4_msg(sb, KERN_NOTICE, "blkdiscard range: %lu len %lu", 187 | + (unsigned long) map.m_lblk, (unsigned long) map.m_len); 188 | +#endif 189 | + 190 | + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 191 | + return ext4_extent_iterator(inode, map.m_lblk, map.m_len, 192 | + discard_callback, &map); 193 | + 194 | + num = map.m_len; 195 | + while (num) { 196 | + int ret = ext4_map_blocks(NULL, inode, &map, 0); 197 | + 198 | + if (ret < 0) 199 | + return ret; 200 | + 201 | + if (ret == 0) { 202 | +#ifdef BLKDISCARD_DEBUG 203 | + ext4_msg(sb, KERN_NOTICE, 204 | + "skip: lblk %lu len %lu ret %lu num %lu", 205 | + (unsigned long) map.m_lblk, 206 | + (unsigned long) map.m_len, 207 | + (unsigned long) ret, 208 | + (unsigned long) num); 209 | +#endif 210 | + map.m_lblk++; 211 | + num--; 212 | + continue; 213 | + } 214 | +#ifdef BLKDISCARD_DEBUG 215 | + ext4_msg(sb, KERN_NOTICE, 216 | + "walk: lblk %lu pblk %llu len %lu ret %lu num %lu", 217 | + (unsigned long) map.m_lblk, 218 | + (unsigned long long) map.m_pblk, 219 | + (unsigned long) map.m_len, 220 | + (unsigned long) ret, 221 | + (unsigned long) num); 222 | +#endif 223 | + if (ret > num) 224 | + ret = num; 225 | + map.m_lblk += ret; 226 | + num -= ret; 227 | + map.m_len = num; 228 | + 229 | +#ifdef BLKDISCARD_DEBUG 230 | + ext4_msg(sb, KERN_NOTICE, "discard: %llu len %lu", 231 | + (unsigned long long) map.m_pblk, (unsigned long) ret); 232 | +#else 233 | + ret = sb_issue_discard(sb, map.m_pblk, ret, 234 | + GFP_KERNEL, 0); 235 | + if (ret) 236 | + return ret; 237 
| +#endif 238 | + } 239 | + return 0; 240 | +} 241 | + 242 | long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 243 | { 244 | struct inode *inode = file_inode(filp); 245 | @@ -1006,6 +1132,17 @@ encryption_policy_out: 246 | return -EOPNOTSUPP; 247 | #endif 248 | } 249 | + case BLKDISCARD: { 250 | + uint64_t range[2]; 251 | + 252 | + if (!(filp->f_mode & FMODE_WRITE)) 253 | + return -EBADF; 254 | + 255 | + if (copy_from_user(range, (void __user *)arg, sizeof(range))) 256 | + return -EFAULT; 257 | + 258 | + return blkdiscard_inode(file_inode(filp), range[0], range[1]); 259 | + } 260 | default: 261 | return -ENOTTY; 262 | } 263 | @@ -1075,6 +1212,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 264 | case EXT4_IOC_GET_ENCRYPTION_METADATA: 265 | case EXT4_IOC_SET_ENCRYPTION_METADATA: 266 | case EXT4_IOC_GET_ENCRYPTED_FILENAME: 267 | + case BLKDISCARD: 268 | break; 269 | default: 270 | return -ENOIOCTLCMD; 271 | -------------------------------------------------------------------------------- /old-patches/add-encryption-debug-files: -------------------------------------------------------------------------------- 1 | ext4: add debugging counters for crypto allocations 2 | 3 | Signed-off-by: Theodore Ts'o 4 | --- 5 | fs/ext4/counter_debug_list.h | 3 +++ 6 | fs/ext4/ext4.h | 6 ++++++ 7 | fs/ext4/page-io.c | 2 ++ 8 | fs/ext4/sysfs.c | 19 +++++++++++++++++++ 9 | 4 files changed, 30 insertions(+) 10 | 11 | diff --git a/fs/ext4/counter_debug_list.h b/fs/ext4/counter_debug_list.h 12 | new file mode 100644 13 | index 0000000..a0eb6d2 14 | --- /dev/null 15 | +++ b/fs/ext4/counter_debug_list.h 16 | @@ -0,0 +1,3 @@ 17 | +EXT4_COUNTER_DEBUG(pageio_bio_submit) 18 | +EXT4_COUNTER_DEBUG(pageio_bio_finish) 19 | + 20 | diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h 21 | index 766b7f7..e4990ac 100644 22 | --- a/fs/ext4/ext4.h 23 | +++ b/fs/ext4/ext4.h 24 | @@ -59,6 +59,12 @@ 25 | #define ext4_debug(fmt, ...) 
no_printk(fmt, ##__VA_ARGS__) 26 | #endif 27 | 28 | +#define EXT4_DEBUG_COUNTER(x) atomic_inc(&ext4_##x) 29 | + 30 | +#define EXT4_COUNTER_DEBUG(x) extern atomic_t ext4_##x; 31 | +#include "counter_debug_list.h" 32 | +#undef EXT4_COUNTER_DEBUG 33 | + 34 | /* 35 | * Turn on EXT_DEBUG to get lots of info about extents operations. 36 | */ 37 | diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c 38 | index 17fbe38..29b5d61 100644 39 | --- a/fs/ext4/page-io.c 40 | +++ b/fs/ext4/page-io.c 41 | @@ -63,6 +63,7 @@ static void ext4_finish_bio(struct bio *bio) 42 | int i; 43 | struct bio_vec *bvec; 44 | 45 | + EXT4_DEBUG_COUNTER(pageio_bio_finish); 46 | bio_for_each_segment_all(bvec, bio, i) { 47 | struct page *page = bvec->bv_page; 48 | #ifdef CONFIG_EXT4_FS_ENCRYPTION 49 | @@ -358,6 +359,7 @@ void ext4_io_submit(struct ext4_io_submit *io) 50 | WRITE_SYNC : WRITE; 51 | bio_get(io->io_bio); 52 | submit_bio(io_op, io->io_bio); 53 | + EXT4_DEBUG_COUNTER(pageio_bio_submit); 54 | bio_put(io->io_bio); 55 | } 56 | io->io_bio = NULL; 57 | diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c 58 | index 62bef0f..12aa1bd 100644 59 | --- a/fs/ext4/sysfs.c 60 | +++ b/fs/ext4/sysfs.c 61 | @@ -233,6 +233,24 @@ static struct attribute *ext4_feat_attrs[] = { 62 | NULL, 63 | }; 64 | 65 | +#define EXT4_ATTR_DEBUG_COUNTER(_name) \ 66 | + EXT4_ATTR_PTR(_name, 0444, pointer_atomic, &ext4_##_name) 67 | + 68 | +#define EXT4_COUNTER_DEBUG(x) atomic_t ext4_##x; 69 | +#include "counter_debug_list.h" 70 | +#undef EXT4_COUNTER_DEBUG 71 | + 72 | +#define EXT4_COUNTER_DEBUG(x) EXT4_ATTR_DEBUG_COUNTER(x); 73 | +#include "counter_debug_list.h" 74 | +#undef EXT4_COUNTER_DEBUG 75 | + 76 | +#define EXT4_COUNTER_DEBUG(x) ATTR_LIST(x), 77 | +static struct attribute *ext4_global_attrs[] = { 78 | +#include "counter_debug_list.h" 79 | + NULL, 80 | +}; 81 | +#undef EXT4_COUNTER_DEBUG 82 | + 83 | static void *calc_ptr(struct ext4_attr *a, struct ext4_sb_info *sbi) 84 | { 85 | switch (a->attr_ptr) { 86 | @@ -334,6 +352,7 @@ 
static struct kobj_type ext4_sb_ktype = { 87 | }; 88 | 89 | static struct kobj_type ext4_ktype = { 90 | + .default_attrs = ext4_global_attrs, 91 | .sysfs_ops = &ext4_attr_ops, 92 | }; 93 | 94 | -------------------------------------------------------------------------------- /old-patches/add-fallocate-mode-blocking-for-debugging: -------------------------------------------------------------------------------- 1 | ext4: add fallocate mode blocking for debugging purposes 2 | 3 | If a particular fallocate mode is causing test failures, give the 4 | tester the ability to block a particular fallocate mode so that the 5 | use of a particular fallocate mode will be reported as not supported. 6 | 7 | For example, if the COLLAPSE_RANGE fallocate mode is causing test 8 | failures, this allows us to suppress it so we can more easily test the 9 | rest of the file system code. 10 | 11 | Signed-off-by: "Theodore Ts'o" 12 | --- 13 | fs/ext4/extents.c | 18 ++++++++++++++++++ 14 | 1 file changed, 18 insertions(+) 15 | 16 | diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c 17 | index 64b4003..f477832 100644 18 | --- a/fs/ext4/extents.c 19 | +++ b/fs/ext4/extents.c 20 | @@ -29,6 +29,7 @@ 21 | * - smart tree reduction 22 | */ 23 | 24 | +#include 25 | #include 26 | #include 27 | #include 28 | @@ -4862,6 +4863,14 @@ out_mutex: 29 | return ret; 30 | } 31 | 32 | +#ifdef CONFIG_EXT4_DEBUG 33 | +int ext4_fallocate_mode_block __read_mostly; 34 | + 35 | +module_param_named(fallocate_mode_block, ext4_fallocate_mode_block, int, 0644); 36 | +MODULE_PARM_DESC(fallocate_mode_block, 37 | + "Fallocate modes which are blocked for debugging purposes"); 38 | +#endif 39 | + 40 | /* 41 | * preallocate space for a file. This implements ext4's fallocate file 42 | * operation, which gets called from sys_fallocate system call. 
43 | @@ -4881,6 +4890,15 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) 44 | struct timespec tv; 45 | unsigned int blkbits = inode->i_blkbits; 46 | 47 | +#ifdef CONFIG_EXT4_DEBUG 48 | + /* 49 | + * For debugging purposes, allow certain fallocate operations 50 | + * to be disabled 51 | + */ 52 | + if (unlikely(mode & ext4_fallocate_mode_block)) 53 | + return -EOPNOTSUPP; 54 | +#endif 55 | + 56 | /* Return error if mode is not supported */ 57 | if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | 58 | FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE)) 59 | -------------------------------------------------------------------------------- /old-patches/add-squelch-errors-support: -------------------------------------------------------------------------------- 1 | ext4: add option for squelching ext4 errors to prevent dmesg from filling up 2 | 3 | Only print one error per inode; this is enough to know that something 4 | is wrong with an inode, without filling dmesg by spamming the system 5 | with messages over and over again. 6 | 7 | This is enabled via sysfs option, which is currently off by default. 8 | Some environments may want to turn this on by default. Eventually we 9 | may want to make this be something which is tunable by a superblock 10 | flag, perhaps. 
11 | 12 | Addresses-Google-Bug: #2507977 13 | 14 | Signed-off-by: "Theodore Ts'o" 15 | --- 16 | fs/ext4/ext4.h | 2 ++ 17 | fs/ext4/super.c | 2 ++ 18 | 2 files changed, 4 insertions(+) 19 | 20 | diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h 21 | index fca1efb..fafa312 100644 22 | --- a/fs/ext4/ext4.h 23 | +++ b/fs/ext4/ext4.h 24 | @@ -1203,6 +1203,7 @@ struct ext4_super_block { 25 | #define EXT4_MF_MNTDIR_SAMPLED 0x0001 26 | #define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */ 27 | #define EXT4_MF_TEST_DUMMY_ENCRYPTION 0x0004 28 | +#define EXT4_MF_FS_SQUELCH 0x0008 /* Squelch file system errors */ 29 | 30 | #ifdef CONFIG_EXT4_FS_ENCRYPTION 31 | #define DUMMY_ENCRYPTION_ENABLED(sbi) (unlikely((sbi)->s_mount_flags & \ 32 | @@ -1450,6 +1451,7 @@ enum { 33 | EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ 34 | EXT4_STATE_ORDERED_MODE, /* data=ordered mode */ 35 | EXT4_STATE_EXT_PRECACHED, /* extents have been precached */ 36 | + EXT4_STATE_ERR_SQUELCHED, /* squelched error */ 37 | }; 38 | 39 | #define EXT4_INODE_BIT_FNS(name, field, offset) \ 40 | diff --git a/fs/ext4/super.c b/fs/ext4/super.c 41 | index f106700..e57ce82 100644 42 | --- a/fs/ext4/super.c 43 | +++ b/fs/ext4/super.c 44 | @@ -2750,6 +2750,7 @@ EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); 45 | EXT4_RO_ATTR_ES_UI(errors_count, s_error_count); 46 | EXT4_RO_ATTR_ES_UI(first_error_time, s_first_error_time); 47 | EXT4_RO_ATTR_ES_UI(last_error_time, s_last_error_time); 48 | +EXT4_RW_ATTR_SBI_BOOL(squelch_errors, s_mount_flags, EXT4_MF_FS_SQUELCH); 49 | 50 | static struct attribute *ext4_attrs[] = { 51 | ATTR_LIST(delayed_allocation_blocks), 52 | @@ -2776,6 +2777,7 @@ static struct attribute *ext4_attrs[] = { 53 | ATTR_LIST(errors_count), 54 | ATTR_LIST(first_error_time), 55 | ATTR_LIST(last_error_time), 56 | + ATTR_LIST(squelch_errors), 57 | NULL, 58 | }; 59 | 60 | -------------------------------------------------------------------------------- 
/old-patches/add-sysfs-bool-support: -------------------------------------------------------------------------------- 1 | ext4: add support for adding boolean toggles to ext4's sysfs directory 2 | 3 | Signed-off-by: "Theodore Ts'o" 4 | --- 5 | fs/ext4/super.c | 38 ++++++++++++++++++++++++++++++++++---- 6 | 1 file changed, 34 insertions(+), 4 deletions(-) 7 | 8 | diff --git a/fs/ext4/super.c b/fs/ext4/super.c 9 | index bf1b8a8..2d294b7 100644 10 | --- a/fs/ext4/super.c 11 | +++ b/fs/ext4/super.c 12 | @@ -2535,6 +2535,7 @@ struct ext4_attr { 13 | int offset; 14 | int deprecated_val; 15 | } u; 16 | + unsigned int mask; 17 | }; 18 | 19 | static int parse_strtoull(const char *buf, 20 | @@ -2677,7 +2678,33 @@ static ssize_t sbi_deprecated_show(struct ext4_attr *a, 21 | return snprintf(buf, PAGE_SIZE, "%d\n", a->u.deprecated_val); 22 | } 23 | 24 | -#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \ 25 | +static ssize_t sbi_bool_show(struct ext4_attr *a, 26 | + struct ext4_sb_info *sbi, char *buf) 27 | +{ 28 | + unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset); 29 | + 30 | + return snprintf(buf, PAGE_SIZE, "%d\n", 31 | + ((*ui & a->mask) == 0) ? 
0 : 1); 32 | +} 33 | + 34 | +static ssize_t sbi_bool_store(struct ext4_attr *a, 35 | + struct ext4_sb_info *sbi, 36 | + const char *buf, size_t count) 37 | +{ 38 | + unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset); 39 | + unsigned long long t; 40 | + 41 | + if (parse_strtoull(buf, 0xffffffff, &t)) 42 | + return -EINVAL; 43 | + if (t) 44 | + *ui |= a->mask; 45 | + else 46 | + *ui &= ~a->mask; 47 | + 48 | + return count; 49 | +} 50 | + 51 | +#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname,_mask)\ 52 | static struct ext4_attr ext4_attr_##_name = { \ 53 | .attr = {.name = __stringify(_name), .mode = _mode }, \ 54 | .show = _show, \ 55 | @@ -2685,6 +2712,7 @@ static struct ext4_attr ext4_attr_##_name = { \ 56 | .u = { \ 57 | .offset = offsetof(struct ext4_sb_info, _elname),\ 58 | }, \ 59 | + .mask = (_mask), \ 60 | } 61 | 62 | #define EXT4_ATTR_OFFSET_ES(_name,_mode,_show,_store,_elname) \ 63 | @@ -2707,8 +2735,10 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) 64 | #define EXT4_RO_ATTR_ES_UI(name, elname) \ 65 | EXT4_ATTR_OFFSET_ES(name, 0444, es_ui_show, NULL, elname) 66 | #define EXT4_RW_ATTR_SBI_UI(name, elname) \ 67 | - EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname) 68 | - 69 | + EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname, 0) 70 | +#define EXT4_RW_ATTR_SBI_BOOL(name, elname, mask) \ 71 | + EXT4_ATTR_OFFSET(name, 0644, sbi_bool_show, sbi_bool_store, \ 72 | + elname, mask) 73 | #define ATTR_LIST(name) &ext4_attr_##name.attr 74 | #define EXT4_DEPRECATED_ATTR(_name, _val) \ 75 | static struct ext4_attr ext4_attr_##_name = { \ 76 | @@ -2724,7 +2754,7 @@ EXT4_RO_ATTR(session_write_kbytes); 77 | EXT4_RO_ATTR(lifetime_write_kbytes); 78 | EXT4_RW_ATTR(reserved_clusters); 79 | EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show, 80 | - inode_readahead_blks_store, s_inode_readahead_blks); 81 | + inode_readahead_blks_store, s_inode_readahead_blks, 0); 82 | 
EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); 83 | EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats); 84 | EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan); 85 | -------------------------------------------------------------------------------- /old-patches/akpm-jbd2-locking-fix: -------------------------------------------------------------------------------- 1 | ext4: akpm's locking hack to fix locking delays 2 | 3 | This is a port of the following patch from Andrew Morton to ext4: 4 | 5 | http://lkml.org/lkml/2008/10/3/22 6 | 7 | This fixes a major contention problem in do_get_write_access() when a 8 | buffer is modified in both the current and committing transaction. 9 | 10 | Signed-off-by: "Theodore Ts'o" 11 | Cc: akpm@linux-foundation.org 12 | --- 13 | fs/ext4/ext4.h | 3 +++ 14 | fs/ext4/super.c | 11 +++++++++++ 15 | fs/jbd2/transaction.c | 12 ++++++++++-- 16 | include/linux/jbd2.h | 1 + 17 | 4 files changed, 25 insertions(+), 2 deletions(-) 18 | 19 | diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h 20 | index b7dbaf1..c5b26f7 100644 21 | --- a/fs/ext4/ext4.h 22 | +++ b/fs/ext4/ext4.h 23 | @@ -1006,6 +1006,9 @@ struct ext4_inode_info { 24 | #define EXT4_MOUNT2_HURD_COMPAT 0x00000004 /* Support HURD-castrated 25 | file systems */ 26 | 27 | +#define EXT4_MOUNT2_AKPM_LOCK_HACK 0x80000000 /* akpm lock hack */ 28 | + 29 | + 30 | #define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ 31 | ~EXT4_MOUNT_##opt 32 | #define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \ 33 | diff --git a/fs/ext4/super.c b/fs/ext4/super.c 34 | index 7b3a41c..ccab545 100644 35 | --- a/fs/ext4/super.c 36 | +++ b/fs/ext4/super.c 37 | @@ -1135,6 +1135,7 @@ enum { 38 | Opt_dioread_nolock, Opt_dioread_lock, 39 | Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, 40 | Opt_max_dir_size_kb, Opt_nojournal_checksum, 41 | + Opt_akpm_lock_hack, 42 | }; 43 | 44 | static const match_table_t tokens = { 45 | @@ -1193,6 +1194,7 @@ static const match_table_t tokens = { 46 | {Opt_i_version, 
"i_version"}, 47 | {Opt_dax, "dax"}, 48 | {Opt_stripe, "stripe=%u"}, 49 | + {Opt_akpm_lock_hack, "akpm_lock_hack"}, 50 | {Opt_delalloc, "delalloc"}, 51 | {Opt_lazytime, "lazytime"}, 52 | {Opt_nolazytime, "nolazytime"}, 53 | @@ -1460,6 +1462,9 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, 54 | case Opt_nolazytime: 55 | sb->s_flags &= ~MS_LAZYTIME; 56 | return 1; 57 | + case Opt_akpm_lock_hack: 58 | + set_opt2(sb, AKPM_LOCK_HACK); 59 | + return 1; 60 | } 61 | 62 | for (m = ext4_mount_opts; m->token != Opt_err; m++) 63 | @@ -1813,6 +1818,8 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, 64 | SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time); 65 | if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) 66 | SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time); 67 | + if (test_opt2(sb, AKPM_LOCK_HACK)) 68 | + seq_puts(seq, ",akpm_lock_hack"); 69 | if (sb->s_flags & MS_I_VERSION) 70 | SEQ_OPTS_PUTS("i_version"); 71 | if (nodefs || sbi->s_stripe) 72 | @@ -4442,6 +4449,10 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) 73 | journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR; 74 | else 75 | journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR; 76 | + if (test_opt2(sb, AKPM_LOCK_HACK)) 77 | + journal->j_flags |= JBD2_LOCK_HACK; 78 | + else 79 | + journal->j_flags &= ~JBD2_LOCK_HACK; 80 | write_unlock(&journal->j_state_lock); 81 | } 82 | 83 | diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c 84 | index 93ffee2..4ba2b76 100644 85 | --- a/fs/jbd2/transaction.c 86 | +++ b/fs/jbd2/transaction.c 87 | @@ -784,6 +784,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh, 88 | char *frozen_buffer = NULL; 89 | int need_copy = 0; 90 | unsigned long start_lock, time_lock; 91 | + int locked = 0; 92 | 93 | WARN_ON(!transaction); 94 | if (is_handle_aborted(handle)) 95 | @@ -799,7 +800,13 @@ repeat: 96 | /* @@@ Need to check for errors here at some point. 
*/ 97 | 98 | start_lock = jiffies; 99 | - lock_buffer(bh); 100 | + if (journal->j_flags & JBD2_LOCK_HACK) { 101 | + if (trylock_buffer(bh)) 102 | + locked = 1; /* lolz */ 103 | + } else { 104 | + lock_buffer(bh); 105 | + locked = 1; 106 | + } 107 | jbd_lock_bh_state(bh); 108 | 109 | /* If it takes too long to lock the buffer, trace it */ 110 | @@ -846,7 +853,8 @@ repeat: 111 | set_buffer_jbddirty(bh); 112 | } 113 | 114 | - unlock_buffer(bh); 115 | + if (locked) 116 | + unlock_buffer(bh); 117 | 118 | error = -EROFS; 119 | if (is_handle_aborted(handle)) { 120 | diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h 121 | index 20e7f78..0f17d76 100644 122 | --- a/include/linux/jbd2.h 123 | +++ b/include/linux/jbd2.h 124 | @@ -1007,6 +1007,7 @@ struct journal_s 125 | #define JBD2_ABORT_ON_SYNCDATA_ERR 0x040 /* Abort the journal on file 126 | * data write error in ordered 127 | * mode */ 128 | +#define JBD2_LOCK_HACK 0x080 /* akpm's locking hack */ 129 | 130 | /* 131 | * Function declarations for the journaling transaction and buffer 132 | -------------------------------------------------------------------------------- /old-patches/block-dio-during-truncate: -------------------------------------------------------------------------------- 1 | ext4: block direct I/O writes during ext4_truncate 2 | 3 | Just as in ext4_punch_hole() it is important that we block DIO writes 4 | while the truncate is proceeding, since during the overwriting DIO 5 | write, we drop i_mutex, which means a truncate could start while the 6 | Direct I/O operation is still in progress. 
7 | 8 | Signed-off-by: "Theodore Ts'o" 9 | Cc: stable@vger.kernel.org 10 | --- 11 | fs/ext4/inode.c | 10 ++++++++-- 12 | 1 file changed, 8 insertions(+), 2 deletions(-) 13 | 14 | diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c 15 | index 98b9bff..3c5edf2 100644 16 | --- a/fs/ext4/inode.c 17 | +++ b/fs/ext4/inode.c 18 | @@ -3659,12 +3659,16 @@ void ext4_truncate(struct inode *inode) 19 | if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) 20 | ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); 21 | 22 | + /* Wait all existing dio workers, newcomers will block on i_mutex */ 23 | + ext4_inode_block_unlocked_dio(inode); 24 | + inode_dio_wait(inode); 25 | + 26 | if (ext4_has_inline_data(inode)) { 27 | int has_inline = 1; 28 | 29 | ext4_inline_data_truncate(inode, &has_inline); 30 | if (has_inline) 31 | - return; 32 | + goto out_resume; 33 | } 34 | 35 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 36 | @@ -3675,7 +3679,7 @@ void ext4_truncate(struct inode *inode) 37 | handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); 38 | if (IS_ERR(handle)) { 39 | ext4_std_error(inode->i_sb, PTR_ERR(handle)); 40 | - return; 41 | + goto out_resume; 42 | } 43 | 44 | if (inode->i_size & (inode->i_sb->s_blocksize - 1)) 45 | @@ -3722,6 +3726,8 @@ out_stop: 46 | ext4_mark_inode_dirty(handle, inode); 47 | ext4_journal_stop(handle); 48 | 49 | +out_resume: 50 | + ext4_inode_resume_unlocked_dio(inode); 51 | trace_ext4_truncate_exit(inode); 52 | } 53 | 54 | -------------------------------------------------------------------------------- /old-patches/commit-as-soon-as-possible-after-log_start_commit: -------------------------------------------------------------------------------- 1 | jbd2: commit as soon as possible after log_start_commit 2 | 3 | Once a transaction has been requested to be committed, don't let any 4 | other handles start under that transaction, and don't allow any 5 | pending transactions to be extended (i.e., in the case of 6 | 
unlink/ftruncate). 7 | 8 | The idea is once the transaction has had log_start_commit() called on 9 | it, at least one thread is blocked waiting for that transaction to 10 | commit, and over time, more and more threads will end up getting 11 | blocked. In order to avoid high variability in file system operations 12 | getting blocked behind a blocked start_this_handle(), we should 13 | try to get the commit started as soon as possible. 14 | 15 | Signed-off-by: "Theodore Ts'o" 16 | --- 17 | fs/jbd2/transaction.c | 4 ++-- 18 | 1 file changed, 2 insertions(+), 2 deletions(-) 19 | 20 | diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c 21 | index 7aa9a32..e1c0b4a 100644 22 | --- a/fs/jbd2/transaction.c 23 | +++ b/fs/jbd2/transaction.c 24 | @@ -186,7 +186,7 @@ static int add_transaction_credits(journal_t *journal, int blocks, 25 | * If the current transaction is locked down for commit, wait 26 | * for the lock to be released. 27 | */ 28 | - if (t->t_state == T_LOCKED) { 29 | + if (t->t_state == T_LOCKED || t->t_requested) { 30 | wait_transaction_locked(journal); 31 | return 1; 32 | } 33 | @@ -559,7 +559,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks) 34 | read_lock(&journal->j_state_lock); 35 | 36 | /* Don't extend a locked-down transaction! */ 37 | - if (transaction->t_state != T_RUNNING) { 38 | + if (transaction->t_state != T_RUNNING || transaction->t_requested) { 39 | jbd_debug(3, "denied handle %p %d blocks: " 40 | "transaction not running\n", handle, nblocks); 41 | goto error_out; 42 | -------------------------------------------------------------------------------- /old-patches/crypto-add-ciphertext_access-mount-option: -------------------------------------------------------------------------------- 1 | ext4 crypto: add ciphertext_access mount option 2 | 3 | Add a mount option which allows root to be able to access the 4 | ciphertext of a file by reading it using O_DIRECT. 
5 | 6 | Signed-off-by: Theodore Ts'o 7 | --- 8 | fs/ext4/ext4.h | 1 + 9 | fs/ext4/file.c | 5 ++++- 10 | fs/ext4/indirect.c | 5 ++--- 11 | fs/ext4/inode.c | 17 ++++++++++------- 12 | fs/ext4/super.c | 5 +++++ 13 | 5 files changed, 22 insertions(+), 11 deletions(-) 14 | 15 | diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h 16 | index 1e20fa9..44278aa7 100644 17 | --- a/fs/ext4/ext4.h 18 | +++ b/fs/ext4/ext4.h 19 | @@ -1052,6 +1052,7 @@ struct ext4_inode_info { 20 | #define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */ 21 | #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ 22 | #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ 23 | +#define EXT4_MOUNT_CIPHERTEXT_ACCESS 0x2000000 /* Direct I/O to ciphertext */ 24 | #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ 25 | #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ 26 | #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ 27 | diff --git a/fs/ext4/file.c b/fs/ext4/file.c 28 | index 749b222..60683ab 100644 29 | --- a/fs/ext4/file.c 30 | +++ b/fs/ext4/file.c 31 | @@ -388,7 +388,10 @@ static int ext4_file_open(struct inode * inode, struct file * filp) 32 | ret = ext4_get_encryption_info(inode); 33 | if (ret) 34 | return -EACCES; 35 | - if (ext4_encryption_info(inode) == NULL) 36 | + if ((ext4_encryption_info(inode) == NULL) && 37 | + !(test_opt(inode->i_sb, CIPHERTEXT_ACCESS) && 38 | + ((filp->f_flags & O_ACCMODE) == O_RDONLY) && 39 | + capable(CAP_SYS_ADMIN))) 40 | return -ENOKEY; 41 | } 42 | /* 43 | diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c 44 | index 355ef9c..bd9d89e 100644 45 | --- a/fs/ext4/indirect.c 46 | +++ b/fs/ext4/indirect.c 47 | @@ -655,11 +655,10 @@ ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter, 48 | int orphan = 0; 49 | size_t count = iov_iter_count(iter); 50 | int retries = 0; 51 | + loff_t final_size = offset + count; 52 | 53 | if 
(iov_iter_rw(iter) == WRITE) { 54 | - loff_t final_size = offset + count; 55 | - 56 | - if (final_size > inode->i_size) { 57 | + if (final_size > i_size_read(inode)) { 58 | /* Credits for sb + inode write */ 59 | handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); 60 | if (IS_ERR(handle)) { 61 | diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c 62 | index ff2f3cd..16f6537 100644 63 | --- a/fs/ext4/inode.c 64 | +++ b/fs/ext4/inode.c 65 | @@ -3279,9 +3279,6 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, 66 | get_block_func = ext4_get_block_write; 67 | dio_flags = DIO_LOCKING; 68 | } 69 | -#ifdef CONFIG_EXT4_FS_ENCRYPTION 70 | - BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)); 71 | -#endif 72 | if (IS_DAX(inode)) 73 | ret = dax_do_io(iocb, inode, iter, offset, get_block_func, 74 | ext4_end_io_dio, dio_flags); 75 | @@ -3344,10 +3341,16 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter, 76 | size_t count = iov_iter_count(iter); 77 | ssize_t ret; 78 | 79 | -#ifdef CONFIG_EXT4_FS_ENCRYPTION 80 | - if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)) 81 | - return 0; 82 | -#endif 83 | + if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { 84 | + if (iov_iter_rw(iter) == WRITE) 85 | + return 0; 86 | + if (test_opt(inode->i_sb, CIPHERTEXT_ACCESS) && 87 | + capable(CAP_SYS_ADMIN)) { 88 | + if (iov_iter_rw(iter) == WRITE) 89 | + return -EPERM; 90 | + } else 91 | + return 0; 92 | + } 93 | 94 | /* 95 | * If we are doing data journalling we don't support O_DIRECT 96 | diff --git a/fs/ext4/super.c b/fs/ext4/super.c 97 | index 486e869..de875b4 100644 98 | --- a/fs/ext4/super.c 99 | +++ b/fs/ext4/super.c 100 | @@ -1182,6 +1182,7 @@ enum { 101 | Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit, 102 | Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, 103 | Opt_data_err_abort, Opt_data_err_ignore, Opt_test_dummy_encryption, 104 | + Opt_ciphertext_access, 
Opt_nociphertext_access, 105 | Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, 106 | Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, 107 | Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, 108 | @@ -1273,6 +1274,8 @@ static const match_table_t tokens = { 109 | {Opt_noinit_itable, "noinit_itable"}, 110 | {Opt_max_dir_size_kb, "max_dir_size_kb=%u"}, 111 | {Opt_test_dummy_encryption, "test_dummy_encryption"}, 112 | + {Opt_ciphertext_access, "ciphertext_access"}, 113 | + {Opt_nociphertext_access, "nociphertext_access"}, 114 | {Opt_removed, "check=none"}, /* mount option from ext2/3 */ 115 | {Opt_removed, "nocheck"}, /* mount option from ext2/3 */ 116 | {Opt_removed, "reservation"}, /* mount option from ext2/3 */ 117 | @@ -1475,6 +1478,8 @@ static const struct mount_opts { 118 | {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT}, 119 | {Opt_max_dir_size_kb, 0, MOPT_GTE0}, 120 | {Opt_test_dummy_encryption, 0, MOPT_GTE0}, 121 | + {Opt_ciphertext_access, EXT4_MOUNT_CIPHERTEXT_ACCESS, MOPT_SET}, 122 | + {Opt_nociphertext_access, EXT4_MOUNT_CIPHERTEXT_ACCESS, MOPT_CLEAR}, 123 | {Opt_err, 0, 0} 124 | }; 125 | 126 | -------------------------------------------------------------------------------- /old-patches/crypto-add-ioctls-to-backup-crypto-metadata: -------------------------------------------------------------------------------- 1 | ext4 crypto: add ioctls to allow backup of encryption metadata 2 | 3 | Add new ioctls which allow for the metadata of encrypted files (both 4 | the filename and the crypto policy) to be backed up and restored. 5 | 6 | [ Included fix from Dan Carpenter for a missing mutex_unlock. 
] 7 | 8 | Signed-off-by: Theodore Ts'o 9 | Signed-off-by: Dan Carpenter 10 | --- 11 | fs/ext4/crypto_key.c | 127 +++++++++++++++++++++++++++++++++- 12 | fs/ext4/crypto_policy.c | 10 +++ 13 | fs/ext4/ext4.h | 25 ++++++- 14 | fs/ext4/ext4_crypto.h | 14 ++++ 15 | fs/ext4/ialloc.c | 5 +- 16 | fs/ext4/ioctl.c | 113 +++++++++++++++++++++++++++++++ 17 | fs/ext4/namei.c | 307 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------- 18 | 7 files changed, 571 insertions(+), 30 deletions(-) 19 | 20 | diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c 21 | index 0129d68..865e41d 100644 22 | --- a/fs/ext4/crypto_key.c 23 | +++ b/fs/ext4/crypto_key.c 24 | @@ -11,11 +11,12 @@ 25 | #include 26 | #include 27 | #include 28 | +#include 29 | #include 30 | #include 31 | #include 32 | 33 | -#include "ext4.h" 34 | +#include "ext4_jbd2.h" 35 | #include "xattr.h" 36 | 37 | static void derive_crypt_complete(struct crypto_async_request *req, int rc) 38 | @@ -272,3 +273,127 @@ int ext4_has_encryption_key(struct inode *inode) 39 | 40 | return (ei->i_crypt_info != NULL); 41 | } 42 | + 43 | +int ext4_get_encryption_metadata(struct inode *inode, 44 | + struct ext4_rw_enc_mdata *mdata) 45 | +{ 46 | + unsigned char *cp = mdata->buf; 47 | + size_t size = mdata->u.len; 48 | + loff_t isize; 49 | + int res; 50 | + 51 | + if (size < sizeof(struct ext4_encryption_context) + 12) 52 | + return -EINVAL; 53 | + 54 | + if (!inode_owner_or_capable(inode) && !capable(CAP_SYS_ADMIN)) 55 | + return -EACCES; 56 | + 57 | + *cp++ = 'e'; 58 | + *cp++ = '5'; 59 | + *cp++ = 0; 60 | + *cp++ = 0; 61 | + isize = i_size_read(inode); 62 | + *((u32 *)cp) = cpu_to_le32(isize & 0xFFFFFFFF); 63 | + cp += 4; 64 | + *((u32 *)cp) = cpu_to_le32(isize >> 32); 65 | + cp += 4; 66 | + size -= 12; 67 | + 68 | + res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, 69 | + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, 70 | + cp, size); 71 | + 72 | + if (res < 0) 73 | + return res; 74 | + if (res > size) 75 | + 
return -ENOSPC; 76 | + 77 | + mdata->u.len = res + 12; 78 | + 79 | + *((u16 *) &mdata->buf[2]) = cpu_to_le16(crc16(~0, mdata->buf, mdata->u.len)); 80 | + return 0; 81 | +} 82 | + 83 | +int ext4_set_encryption_metadata(struct inode *inode, 84 | + struct ext4_rw_enc_mdata *mdata) 85 | +{ 86 | + struct ext4_encryption_context *ctx; 87 | + unsigned char *cp = mdata->buf; 88 | + handle_t *handle = NULL; 89 | + loff_t size; 90 | + unsigned bs = inode->i_sb->s_blocksize; 91 | + int res; 92 | + u16 crc; 93 | + 94 | + if (!inode_owner_or_capable(inode) && !capable(CAP_SYS_ADMIN)) 95 | + return -EACCES; 96 | + 97 | + if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) 98 | + return -EINVAL; 99 | + 100 | + if (mdata->u.len != sizeof(struct ext4_encryption_context) + 12) 101 | + return -EINVAL; 102 | + 103 | + if (cp[0] != 'e' || cp[1] != '5') 104 | + return -EINVAL; 105 | + crc = le16_to_cpu(*(u16 *)(cp+2)); 106 | + cp[2] = cp[3] = 0; 107 | + cp += 4; 108 | + 109 | + if (crc != crc16(~0, mdata->buf, mdata->u.len)) 110 | + return -EINVAL; 111 | + 112 | + size = le32_to_cpu(*(u32 *) cp); 113 | + cp += 4; 114 | + size += ((u64) le32_to_cpu(*(u32 *) cp)) << 32; 115 | + cp += 4; 116 | + 117 | + ctx = (struct ext4_encryption_context *) cp; 118 | + res = ext4_validate_encryption_context(ctx); 119 | + if (res) 120 | + return res; 121 | + 122 | + res = ext4_convert_inline_data(inode); 123 | + if (res) 124 | + return res; 125 | + 126 | + res = filemap_write_and_wait(&inode->i_data); 127 | + if (res) 128 | + return res; 129 | + 130 | + mutex_lock(&inode->i_mutex); 131 | + if (S_ISREG(inode->i_mode) && 132 | + round_up(size, bs) != round_up(i_size_read(inode), bs)) { 133 | + res = -EINVAL; 134 | + goto errout; 135 | + } 136 | + 137 | + handle = ext4_journal_start(inode, EXT4_HT_MISC, 138 | + ext4_jbd2_credits_xattr(inode)); 139 | + if (IS_ERR(handle)) { 140 | + res = PTR_ERR(handle); 141 | + goto errout; 142 | + } 143 | + res = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION, 144 | 
+ EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, 145 | + sizeof(struct ext4_encryption_context), 0); 146 | + if (res < 0) 147 | + goto errout; 148 | + ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); 149 | + ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); 150 | + 151 | + if (S_ISREG(inode->i_mode)) { 152 | + i_size_write(inode, size); 153 | + EXT4_I(inode)->i_disksize = size; 154 | + } 155 | + res = ext4_mark_inode_dirty(handle, inode); 156 | + if (res) 157 | + EXT4_ERROR_INODE(inode, "Failed to mark inode dirty"); 158 | + else 159 | + res = ext4_get_encryption_info(inode); 160 | +errout: 161 | + mutex_unlock(&inode->i_mutex); 162 | + if (handle) 163 | + ext4_journal_stop(handle); 164 | + return res; 165 | +} 166 | diff --git a/fs/ext4/crypto_policy.c b/fs/ext4/crypto_policy.c 167 | index ad05069..08565f5 100644 168 | --- a/fs/ext4/crypto_policy.c 169 | +++ b/fs/ext4/crypto_policy.c 170 | @@ -180,6 +180,16 @@ int ext4_is_child_context_consistent_with_parent(struct inode *parent, 171 | (parent_ci->ci_flags == child_ci->ci_flags)); 172 | } 173 | 174 | +int ext4_validate_encryption_context(struct ext4_encryption_context *ctx) 175 | +{ 176 | + if ((ctx->format != EXT4_ENCRYPTION_CONTEXT_FORMAT_V1) || 177 | + !ext4_valid_contents_enc_mode(ctx->contents_encryption_mode) || 178 | + !ext4_valid_filenames_enc_mode(ctx->filenames_encryption_mode) || 179 | + (ctx->flags & ~EXT4_POLICY_FLAGS_VALID)) 180 | + return -EINVAL; 181 | + return 0; 182 | +} 183 | + 184 | /** 185 | * ext4_inherit_context() - Sets a child context from its parent 186 | * @parent: Parent inode from which the context is inherited. 
187 | diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h 188 | index 9a92f8b..bb2c31d 100644 189 | --- a/fs/ext4/ext4.h 190 | +++ b/fs/ext4/ext4.h 191 | @@ -637,6 +637,10 @@ enum { 192 | #define EXT4_IOC_SET_ENCRYPTION_POLICY _IOR('f', 19, struct ext4_encryption_policy) 193 | #define EXT4_IOC_GET_ENCRYPTION_PWSALT _IOW('f', 20, __u8[16]) 194 | #define EXT4_IOC_GET_ENCRYPTION_POLICY _IOW('f', 21, struct ext4_encryption_policy) 195 | +#define EXT4_IOC_GET_ENCRYPTION_METADATA _IOWR('f', 22, struct ext4_encrypted_metadata) 196 | +#define EXT4_IOC_SET_ENCRYPTION_METADATA _IOR('f', 23, struct ext4_encrypted_metadata) 197 | +#define EXT4_IOC_GET_ENCRYPTED_FILENAME _IOWR('f', 24, struct ext4_encrypted_metadata) 198 | +#define EXT4_IOC_SET_ENCRYPTED_FILENAME _IOR('f', 25, struct ext4_encrypted_metadata) 199 | 200 | #ifndef FS_IOC_FSGETXATTR 201 | /* Until the uapi changes get merged for project quota... */ 202 | @@ -2294,6 +2298,7 @@ ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); 203 | /* crypto_policy.c */ 204 | int ext4_is_child_context_consistent_with_parent(struct inode *parent, 205 | struct inode *child); 206 | +int ext4_validate_encryption_context(struct ext4_encryption_context *ctx); 207 | int ext4_inherit_context(struct inode *parent, struct inode *child); 208 | void ext4_to_hex(char *dst, char *src, size_t src_size); 209 | int ext4_process_policy(const struct ext4_encryption_policy *policy, 210 | @@ -2380,6 +2385,10 @@ static inline void ext4_fname_free_filename(struct ext4_filename *fname) { } 211 | void ext4_free_crypt_info(struct ext4_crypt_info *ci); 212 | void ext4_free_encryption_info(struct inode *inode, struct ext4_crypt_info *ci); 213 | int _ext4_get_encryption_info(struct inode *inode); 214 | +int ext4_set_encryption_metadata(struct inode *inode, 215 | + struct ext4_rw_enc_mdata *mdata); 216 | +int ext4_get_encryption_metadata(struct inode *inode, 217 | + struct ext4_rw_enc_mdata *mdata); 218 | 219 | #ifdef CONFIG_EXT4_FS_ENCRYPTION 220 | int 
ext4_has_encryption_key(struct inode *inode); 221 | @@ -2469,18 +2478,24 @@ extern int ext4fs_dirhash(const char *name, int len, struct 222 | dx_hash_info *hinfo); 223 | 224 | /* ialloc.c */ 225 | +#define EXT4_NEW_INODE_NOENCRYPT 0x0001 226 | extern struct inode *__ext4_new_inode(handle_t *, struct inode *, umode_t, 227 | const struct qstr *qstr, __u32 goal, 228 | uid_t *owner, int handle_type, 229 | - unsigned int line_no, int nblocks); 230 | + unsigned int line_no, int nblocks, 231 | + int flags); 232 | 233 | #define ext4_new_inode(handle, dir, mode, qstr, goal, owner) \ 234 | __ext4_new_inode((handle), (dir), (mode), (qstr), (goal), (owner), \ 235 | - 0, 0, 0) 236 | + 0, 0, 0, 0) 237 | #define ext4_new_inode_start_handle(dir, mode, qstr, goal, owner, \ 238 | type, nblocks) \ 239 | __ext4_new_inode(NULL, (dir), (mode), (qstr), (goal), (owner), \ 240 | - (type), __LINE__, (nblocks)) 241 | + (type), __LINE__, (nblocks), 0) 242 | +#define ext4_new_inode_start_handle_flags(dir, mode, qstr, goal, owner, \ 243 | + type, nblocks, flags) \ 244 | + __ext4_new_inode(NULL, (dir), (mode), (qstr), (goal), (owner), \ 245 | + (type), __LINE__, (nblocks), (flags)) 246 | 247 | 248 | extern void ext4_free_inode(handle_t *, struct inode *); 249 | @@ -2621,6 +2636,10 @@ extern int ext4_generic_delete_entry(handle_t *handle, 250 | int buf_size, 251 | int csum_size); 252 | extern int ext4_empty_dir(struct inode *inode); 253 | +extern int ext4_get_encrypted_filename(struct file *filp, 254 | + struct ext4_rw_enc_mdata *mdata); 255 | +extern int ext4_set_encrypted_filename(struct inode *dir, 256 | + struct ext4_rw_enc_mdata *efn); 257 | 258 | /* resize.c */ 259 | extern int ext4_group_add(struct super_block *sb, 260 | diff --git a/fs/ext4/ext4_crypto.h b/fs/ext4/ext4_crypto.h 261 | index 1f73c29..600da7e 100644 262 | --- a/fs/ext4/ext4_crypto.h 263 | +++ b/fs/ext4/ext4_crypto.h 264 | @@ -156,4 +156,18 @@ static inline u32 encrypted_symlink_data_len(u32 l) 265 | return (l + sizeof(struct 
ext4_encrypted_symlink_data) - 1); 266 | } 267 | 268 | +/** 269 | + * Structure used for communicating encrypted metadata with userspace 270 | + */ 271 | +struct ext4_encrypted_metadata { 272 | + s32 fd; /* Only used by EXT4_IOC_SET_ENCRYPTED_FILENAME */ 273 | + u32 len; 274 | + unsigned char __user *data; 275 | +}; 276 | + 277 | +/* In-kernel structure */ 278 | +struct ext4_rw_enc_mdata { 279 | + struct ext4_encrypted_metadata u; 280 | + unsigned char *buf; 281 | +}; 282 | #endif /* _EXT4_CRYPTO_H */ 283 | diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c 284 | index 237b877..e96e4ae 100644 285 | --- a/fs/ext4/ialloc.c 286 | +++ b/fs/ext4/ialloc.c 287 | @@ -742,7 +742,7 @@ out: 288 | struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, 289 | umode_t mode, const struct qstr *qstr, 290 | __u32 goal, uid_t *owner, int handle_type, 291 | - unsigned int line_no, int nblocks) 292 | + unsigned int line_no, int nblocks, int flags) 293 | { 294 | struct super_block *sb; 295 | struct buffer_head *inode_bitmap_bh = NULL; 296 | @@ -764,7 +764,8 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, 297 | if (!dir || !dir->i_nlink) 298 | return ERR_PTR(-EPERM); 299 | 300 | - if ((ext4_encrypted_inode(dir) || 301 | + if (!(flags & EXT4_NEW_INODE_NOENCRYPT) && 302 | + (ext4_encrypted_inode(dir) || 303 | DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb))) && 304 | (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) { 305 | err = ext4_get_encryption_info(dir); 306 | diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c 307 | index eae5917..65d38e9 100644 308 | --- a/fs/ext4/ioctl.c 309 | +++ b/fs/ext4/ioctl.c 310 | @@ -435,6 +435,39 @@ static inline unsigned long ext4_xflags_to_iflags(__u32 xflags) 311 | return iflags; 312 | } 313 | 314 | +#ifdef CONFIG_EXT4_FS_ENCRYPTION 315 | +static int read_user_mdata(unsigned long arg, 316 | + struct ext4_rw_enc_mdata *mdata) 317 | +{ 318 | + if (copy_from_user(&mdata->u, 319 | + (struct ext4_encrypted_metadata __user *)arg, 
320 | + sizeof(struct ext4_encrypted_metadata))) 321 | + return -EFAULT; 322 | + /* Sanity check, as nothing should need to be this big */ 323 | + if (mdata->u.len > PAGE_SIZE) 324 | + return -EINVAL; 325 | + mdata->buf = kmalloc(mdata->u.len, GFP_KERNEL); 326 | + if (!mdata->buf) 327 | + return -ENOMEM; 328 | + if (copy_from_user(mdata->buf, mdata->u.data, mdata->u.len)) 329 | + return -EFAULT; 330 | + return 0; 331 | + 332 | +} 333 | + 334 | +static int write_user_mdata(unsigned long arg, 335 | + struct ext4_rw_enc_mdata *mdata) 336 | +{ 337 | + if (copy_to_user(mdata->u.data, mdata->buf, mdata->u.len)) 338 | + return -EFAULT; 339 | + if (copy_to_user((struct ext4_encrypted_metadata __user *)arg, 340 | + &mdata->u, 341 | + sizeof(struct ext4_encrypted_metadata))) 342 | + return -EFAULT; 343 | + return 0; 344 | +} 345 | +#endif 346 | + 347 | long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 348 | { 349 | struct inode *inode = file_inode(filp); 350 | @@ -896,6 +929,83 @@ encryption_policy_out: 351 | 352 | return 0; 353 | } 354 | + case EXT4_IOC_GET_ENCRYPTION_METADATA: { 355 | +#ifdef CONFIG_EXT4_FS_ENCRYPTION 356 | + struct ext4_rw_enc_mdata mdata; 357 | + int err = 0; 358 | + 359 | + if (!ext4_encrypted_inode(inode)) 360 | + return -ENOENT; 361 | + 362 | + err = read_user_mdata(arg, &mdata); 363 | + if (err) 364 | + return err; 365 | + err = ext4_get_encryption_metadata(inode, &mdata); 366 | + if (!err) 367 | + err = write_user_mdata(arg, &mdata); 368 | + kfree(mdata.buf); 369 | + return err; 370 | +#else 371 | + return -EOPNOTSUPP; 372 | +#endif 373 | + } 374 | + case EXT4_IOC_SET_ENCRYPTION_METADATA: { 375 | +#ifdef CONFIG_EXT4_FS_ENCRYPTION 376 | + struct ext4_rw_enc_mdata mdata; 377 | + int err = 0; 378 | + 379 | + if (ext4_encrypted_inode(inode)) 380 | + return -EINVAL; 381 | + err = read_user_mdata(arg, &mdata); 382 | + if (err) 383 | + return err; 384 | + err = mnt_want_write_file(filp); 385 | + if (!err) 386 | + err = 
ext4_set_encryption_metadata(inode, &mdata); 387 | + mnt_drop_write_file(filp); 388 | + kfree(mdata.buf); 389 | + return err; 390 | +#else 391 | + return -EOPNOTSUPP; 392 | +#endif 393 | + } 394 | + case EXT4_IOC_GET_ENCRYPTED_FILENAME: { 395 | +#ifdef CONFIG_EXT4_FS_ENCRYPTION 396 | + struct ext4_rw_enc_mdata mdata; 397 | + int err = 0; 398 | + 399 | + if (!ext4_encrypted_inode(inode)) 400 | + return -ENOENT; 401 | + err = read_user_mdata(arg, &mdata); 402 | + if (err) 403 | + return err; 404 | + err = ext4_get_encrypted_filename(filp, &mdata); 405 | + if (!err) 406 | + err = write_user_mdata(arg, &mdata); 407 | + kfree(mdata.buf); 408 | + return err; 409 | +#else 410 | + return -EOPNOTSUPP; 411 | +#endif 412 | + } 413 | + case EXT4_IOC_SET_ENCRYPTED_FILENAME: { 414 | +#ifdef CONFIG_EXT4_FS_ENCRYPTION 415 | + struct ext4_rw_enc_mdata mdata; 416 | + int err = 0; 417 | + 418 | + err = read_user_mdata(arg, &mdata); 419 | + if (err) 420 | + return err; 421 | + err = mnt_want_write_file(filp); 422 | + if (!err) 423 | + err = ext4_set_encrypted_filename(inode, &mdata); 424 | + mnt_drop_write_file(filp); 425 | + kfree(mdata.buf); 426 | + return err; 427 | +#else 428 | + return -EOPNOTSUPP; 429 | +#endif 430 | + } 431 | default: 432 | return -ENOTTY; 433 | } 434 | @@ -962,6 +1072,9 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 435 | case EXT4_IOC_SET_ENCRYPTION_POLICY: 436 | case EXT4_IOC_GET_ENCRYPTION_PWSALT: 437 | case EXT4_IOC_GET_ENCRYPTION_POLICY: 438 | + case EXT4_IOC_GET_ENCRYPTION_METADATA: 439 | + case EXT4_IOC_SET_ENCRYPTION_METADATA: 440 | + case EXT4_IOC_GET_ENCRYPTED_FILENAME: 441 | break; 442 | default: 443 | return -ENOIOCTLCMD; 444 | diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c 445 | index c07422d..32edbe3 100644 446 | --- a/fs/ext4/namei.c 447 | +++ b/fs/ext4/namei.c 448 | @@ -33,6 +33,8 @@ 449 | #include 450 | #include 451 | #include 452 | +#include 453 | +#include 454 | #include "ext4.h" 455 | #include 
"ext4_jbd2.h" 456 | 457 | @@ -2075,24 +2077,16 @@ out_frames: 458 | } 459 | 460 | /* 461 | - * ext4_add_entry() 462 | - * 463 | - * adds a file entry to the specified directory, using the same 464 | - * semantics as ext4_find_entry(). It returns NULL if it failed. 465 | - * 466 | - * NOTE!! The inode part of 'de' is left at 0 - which means you 467 | - * may not sleep between calling this and putting something into 468 | - * the entry, as someone else might have used it while you slept. 469 | + * Add a directory entry to a directory, given the filename and the 470 | + * inode it will point to. 471 | */ 472 | -static int ext4_add_entry(handle_t *handle, struct dentry *dentry, 473 | - struct inode *inode) 474 | +static int ext4_add_fname(handle_t *handle, struct inode *dir, 475 | + struct ext4_filename *fname, struct inode *inode) 476 | { 477 | - struct inode *dir = d_inode(dentry->d_parent); 478 | struct buffer_head *bh = NULL; 479 | struct ext4_dir_entry_2 *de; 480 | struct ext4_dir_entry_tail *t; 481 | struct super_block *sb; 482 | - struct ext4_filename fname; 483 | int retval; 484 | int dx_fallback=0; 485 | unsigned blocksize; 486 | @@ -2104,15 +2098,9 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, 487 | 488 | sb = dir->i_sb; 489 | blocksize = sb->s_blocksize; 490 | - if (!dentry->d_name.len) 491 | - return -EINVAL; 492 | - 493 | - retval = ext4_fname_setup_filename(dir, &dentry->d_name, 0, &fname); 494 | - if (retval) 495 | - return retval; 496 | 497 | if (ext4_has_inline_data(dir)) { 498 | - retval = ext4_try_add_inline_entry(handle, &fname, dir, inode); 499 | + retval = ext4_try_add_inline_entry(handle, fname, dir, inode); 500 | if (retval < 0) 501 | goto out; 502 | if (retval == 1) { 503 | @@ -2122,7 +2110,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, 504 | } 505 | 506 | if (is_dx(dir)) { 507 | - retval = ext4_dx_add_entry(handle, &fname, dir, inode); 508 | + retval = ext4_dx_add_entry(handle, fname, dir, 
inode); 509 | if (!retval || (retval != ERR_BAD_DX_DIR)) 510 | goto out; 511 | ext4_clear_inode_flag(dir, EXT4_INODE_INDEX); 512 | @@ -2137,14 +2125,14 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, 513 | bh = NULL; 514 | goto out; 515 | } 516 | - retval = add_dirent_to_buf(handle, &fname, dir, inode, 517 | + retval = add_dirent_to_buf(handle, fname, dir, inode, 518 | NULL, bh); 519 | if (retval != -ENOSPC) 520 | goto out; 521 | 522 | if (blocks == 1 && !dx_fallback && 523 | ext4_has_feature_dir_index(sb)) { 524 | - retval = make_indexed_dir(handle, &fname, dir, 525 | + retval = make_indexed_dir(handle, fname, dir, 526 | inode, bh); 527 | bh = NULL; /* make_indexed_dir releases bh */ 528 | goto out; 529 | @@ -2166,9 +2154,8 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, 530 | initialize_dirent_tail(t, blocksize); 531 | } 532 | 533 | - retval = add_dirent_to_buf(handle, &fname, dir, inode, de, bh); 534 | + retval = add_dirent_to_buf(handle, fname, dir, inode, de, bh); 535 | out: 536 | - ext4_fname_free_filename(&fname); 537 | brelse(bh); 538 | if (retval == 0) 539 | ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY); 540 | @@ -2176,6 +2163,29 @@ out: 541 | } 542 | 543 | /* 544 | + * Create a directory entry associated with the specified dentry and 545 | + * inode. 
546 | + */ 547 | +static int ext4_add_entry(handle_t *handle, struct dentry *dentry, 548 | + struct inode *inode) 549 | +{ 550 | + struct inode *dir = d_inode(dentry->d_parent); 551 | + struct ext4_filename fname; 552 | + int retval; 553 | + 554 | + if (!dentry->d_name.len) 555 | + return -EINVAL; 556 | + 557 | + retval = ext4_fname_setup_filename(dir, &dentry->d_name, 0, &fname); 558 | + if (retval) 559 | + return retval; 560 | + 561 | + retval = ext4_add_fname(handle, dir, &fname, inode); 562 | + ext4_fname_free_filename(&fname); 563 | + return retval; 564 | +} 565 | + 566 | +/* 567 | * Returns 0 for success, or a negative error value 568 | */ 569 | static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, 570 | @@ -3905,3 +3915,252 @@ const struct inode_operations ext4_special_inode_operations = { 571 | .get_acl = ext4_get_acl, 572 | .set_acl = ext4_set_acl, 573 | }; 574 | + 575 | +int ext4_get_encrypted_filename(struct file *filp, 576 | + struct ext4_rw_enc_mdata *mdata) 577 | +{ 578 | + unsigned char *cp = mdata->buf; 579 | + struct dentry *dentry = filp->f_path.dentry; 580 | + struct inode *inode = file_inode(filp); 581 | + struct inode *dir = dentry->d_parent->d_inode; 582 | + struct buffer_head *bh; 583 | + struct ext4_dir_entry_2 *de; 584 | + int isdir = S_ISDIR(inode->i_mode); 585 | + int len = isdir ? 10 : 4; 586 | + int ret; 587 | + 588 | + if (!dir || !ext4_encrypted_inode(dir)) 589 | + return -EINVAL; 590 | + 591 | + if (!inode_owner_or_capable(dir) && !capable(CAP_SYS_ADMIN)) 592 | + return -EACCES; 593 | + 594 | + if (mdata->u.len < len) 595 | + return -ENOSPC; 596 | + 597 | + *cp++ = 'e'; 598 | + *cp++ = isdir ? 
'd' : 'f'; 599 | + *cp++ = 0; 600 | + *cp++ = 0; 601 | + 602 | + if (isdir) { 603 | + *((u32 *)cp) = cpu_to_le32(inode->i_mode); 604 | + cp += 4; 605 | + ret = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, 606 | + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, 607 | + NULL, 0); 608 | + if (ret < 0) 609 | + return ret; 610 | + *((u16 *)cp) = cpu_to_le16((u16) ret); 611 | + cp += 2; 612 | + 613 | + len += ret; 614 | + if (mdata->u.len < len) 615 | + return -ENOSPC; 616 | + ret = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, 617 | + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, 618 | + cp, ret); 619 | + if (ret < 0) 620 | + return ret; 621 | + cp += ret; 622 | + } 623 | + 624 | + bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); 625 | + if (IS_ERR(bh)) 626 | + return PTR_ERR(bh); 627 | + if (de == NULL) 628 | + return -ENOENT; 629 | + 630 | + len += de->name_len; 631 | + if (mdata->u.len < len) 632 | + return -ENOSPC; 633 | + 634 | + mdata->u.len = len; 635 | + memcpy(cp, de->name, de->name_len); 636 | + *((u16 *) &mdata->buf[2]) = cpu_to_le16(crc16(~0, mdata->buf, 637 | + mdata->u.len)); 638 | + return 0; 639 | +} 640 | + 641 | +int ext4_set_encrypted_filename(struct inode *dir, 642 | + struct ext4_rw_enc_mdata *mdata) 643 | +{ 644 | + struct ext4_encryption_context *ctx = NULL; 645 | + struct ext4_filename fname; 646 | + unsigned char *cp = mdata->buf; 647 | + struct inode *inode = NULL; 648 | + struct fd fd; 649 | + handle_t *handle = NULL; 650 | + umode_t mode; 651 | + u16 crc, xlen, credits; 652 | + int retval = 0, retries = 0, do_retry = 0; 653 | + int len = mdata->u.len; 654 | + 655 | + if (!dir || !ext4_encrypted_inode(dir)) 656 | + return -EINVAL; 657 | + 658 | + retval = inode_permission(dir, MAY_WRITE | MAY_EXEC); 659 | + if (retval) 660 | + return retval; 661 | + 662 | + if (len < 4) 663 | + return -EINVAL; 664 | + 665 | + if (cp[0] != 'e' || 666 | + cp[1] != ((mdata->u.fd == -1) ? 
'd' : 'f')) 667 | + return -EINVAL; 668 | + crc = le16_to_cpu(*(u16 *)(cp+2)); 669 | + cp[2] = cp[3] = 0; 670 | + cp += 4; len -= 4; 671 | + 672 | + if (crc != crc16(~0, mdata->buf, mdata->u.len)) 673 | + return -EINVAL; 674 | + 675 | + if ((len < EXT4_CRYPTO_BLOCK_SIZE) || (len > EXT4_NAME_LEN + 1)) 676 | + return -EINVAL; 677 | + 678 | + retval = dquot_initialize(dir); 679 | + if (retval) 680 | + return retval; 681 | + 682 | + credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 683 | + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2); 684 | + 685 | + if (mdata->u.fd >= 0) { 686 | + fd = fdget(mdata->u.fd); 687 | + if (!fd.file) 688 | + return -EBADF; 689 | + inode = file_inode(fd.file); 690 | + mode = inode->i_mode; 691 | + retval = -EISDIR; 692 | + if (S_ISDIR(mode)) 693 | + goto out; 694 | + } else if (mdata->u.fd == -1) { 695 | + /* do an encrypted mkdir */ 696 | + fd.file = NULL; 697 | + if (EXT4_DIR_LINK_MAX(dir)) 698 | + return -EMLINK; 699 | + if (len < 6) 700 | + return -EINVAL; 701 | + mode = le32_to_cpu(*(u32 *)cp); 702 | + cp += 4; 703 | + xlen = le16_to_cpu(*(u16 *)cp); 704 | + cp += 2; len -= 6; 705 | + 706 | + if (len < xlen || 707 | + xlen != sizeof(struct ext4_encryption_context)) 708 | + return -EINVAL; 709 | + 710 | + ctx = (struct ext4_encryption_context *) cp; 711 | + retval = ext4_validate_encryption_context(ctx); 712 | + if (retval) 713 | + return retval; 714 | + cp += xlen; len -= xlen; 715 | + 716 | + /* credits for the mkdir and xattr set */ 717 | + credits += (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 718 | + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + 719 | + ext4_jbd2_credits_xattr(dir)); 720 | + retry: 721 | + inode = ext4_new_inode_start_handle_flags(dir, mode, NULL, 0, 722 | + NULL, EXT4_HT_DIR, credits, 723 | + EXT4_NEW_INODE_NOENCRYPT); 724 | + handle = ext4_journal_current_handle(); 725 | + if (IS_ERR(inode)) { 726 | + retval = PTR_ERR(inode); 727 | + inode = NULL; 728 | + goto out; 729 | + } 730 | + inode->i_op = &ext4_dir_inode_operations; 731 | + inode->i_fop 
= &ext4_dir_operations; 732 | + retval = ext4_init_new_dir(handle, dir, inode); 733 | + if (retval) 734 | + goto out; 735 | + 736 | + retval = ext4_xattr_set_handle(handle, inode, 737 | + EXT4_XATTR_INDEX_ENCRYPTION, 738 | + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, 739 | + sizeof(struct ext4_encryption_context), 740 | + fd.file ? XATTR_REPLACE : XATTR_CREATE); 741 | + if (retval) 742 | + goto out; 743 | + ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); 744 | + ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); 745 | + 746 | + goto insert_fname; 747 | + } else 748 | + return -EINVAL; 749 | + 750 | + 751 | + if ((mode & S_ISUID) || 752 | + ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))) { 753 | + /* 754 | + * root or the inode owner can link even in the case 755 | + * of "unsafe" hard link sources. See 756 | + * safe_hardlink_sources() in fs/namei.c 757 | + */ 758 | + if (!inode_owner_or_capable(inode) && !capable(CAP_SYS_ADMIN)) { 759 | + retval = -EACCES; 760 | + goto out; 761 | + } 762 | + } 763 | + 764 | + retval = inode_permission(inode, MAY_READ | MAY_WRITE); 765 | + if (!retval && !inode_owner_or_capable(inode) && 766 | + !capable(CAP_SYS_ADMIN)) 767 | + goto out; 768 | + 769 | + handle = ext4_journal_start(dir, EXT4_HT_DIR, 770 | + (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 771 | + EXT4_INDEX_EXTRA_TRANS_BLOCKS) + 2); 772 | + if (IS_ERR(handle)) { 773 | + retval = PTR_ERR(handle); 774 | + goto out; 775 | + } 776 | + 777 | +insert_fname: 778 | + if (!ext4_is_child_context_consistent_with_parent(dir, inode)) { 779 | + retval = -EPERM; 780 | + goto out; 781 | + } 782 | + 783 | + memset(&fname, 0, sizeof(fname)); 784 | + fname.disk_name.name = cp; 785 | + fname.disk_name.len = len; 786 | + retval = ext4_add_fname(handle, dir, &fname, inode); 787 | + if (retval) 788 | + goto out; 789 | + 790 | + if (fd.file) 791 | + ext4_inc_count(handle, inode); 792 | + ext4_mark_inode_dirty(handle, inode); 793 | + if (!fd.file) 794 | + ext4_inc_count(handle, dir); 795 | 
+ ext4_update_dx_flag(dir); 796 | + ext4_mark_inode_dirty(handle, dir); 797 | + if (fd.file == NULL) { 798 | + unlock_new_inode(inode); 799 | + iput(inode); 800 | + } 801 | + 802 | +out: 803 | + if (fd.file) 804 | + fdput(fd); 805 | + else if (retval && inode && (mdata->u.fd == -1)) { 806 | + /* need to undo a failed attempted mkdir */ 807 | + clear_nlink(inode); 808 | + unlock_new_inode(inode); 809 | + ext4_mark_inode_dirty(handle, inode); 810 | + iput(inode); 811 | + if (retval == -ENOSPC && 812 | + ext4_should_retry_alloc(dir->i_sb, &retries)) 813 | + do_retry++; 814 | + } 815 | + if (handle) 816 | + ext4_journal_stop(handle); 817 | + if (do_retry) { 818 | + do_retry = 0; 819 | + goto retry; 820 | + } 821 | + return retval; 822 | +} 823 | -------------------------------------------------------------------------------- /old-patches/crypto-rename-ext4_get_encryption_info: -------------------------------------------------------------------------------- 1 | --- 2 | fs/ext4/crypto.c | 15 +++++++++++---- 3 | fs/ext4/crypto_fname.c | 64 +++++++++++++++++++++++++++++++++++++++++----------------------- 4 | fs/ext4/crypto_key.c | 2 +- 5 | fs/ext4/crypto_policy.c | 43 ++++++++++++++++++++++--------------------- 6 | fs/ext4/dir.c | 8 ++++++-- 7 | fs/ext4/ext4.h | 37 ++++++++++++++++++++++--------------- 8 | fs/ext4/file.c | 12 +++++------- 9 | fs/ext4/ialloc.c | 6 +++--- 10 | fs/ext4/namei.c | 17 +++-------------- 11 | fs/ext4/symlink.c | 4 ---- 12 | 10 files changed, 114 insertions(+), 94 deletions(-) 13 | 14 | diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c 15 | index c802120..e38908d 100644 16 | --- a/fs/ext4/crypto.c 17 | +++ b/fs/ext4/crypto.c 18 | @@ -98,9 +98,8 @@ struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode) 19 | struct ext4_crypto_ctx *ctx = NULL; 20 | int res = 0; 21 | unsigned long flags; 22 | - struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; 23 | 24 | - if (ci == NULL) 25 | + if (EXT4_I(inode)->i_crypt_info == NULL) 26 | return 
ERR_PTR(-ENOKEY); 27 | 28 | /* 29 | @@ -264,10 +263,18 @@ static int ext4_page_crypto(struct inode *inode, 30 | struct ablkcipher_request *req = NULL; 31 | DECLARE_EXT4_COMPLETION_RESULT(ecr); 32 | struct scatterlist dst, src; 33 | - struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; 34 | - struct crypto_ablkcipher *tfm = ci->ci_ctfm; 35 | + struct ext4_crypt_info *ci; 36 | + struct crypto_ablkcipher *tfm; 37 | int res = 0; 38 | 39 | + rcu_read_lock(); 40 | + ci = EXT4_I(inode)->i_crypt_info; 41 | + if (ci == NULL) { 42 | + rcu_read_unlock(); 43 | + return -ENOKEY; 44 | + } 45 | + tfm = ci->ci_ctfm; 46 | + 47 | req = ablkcipher_request_alloc(tfm, GFP_NOFS); 48 | if (!req) { 49 | printk_ratelimited(KERN_ERR 50 | diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c 51 | index 2fbef8a..db2d134 100644 52 | --- a/fs/ext4/crypto_fname.c 53 | +++ b/fs/ext4/crypto_fname.c 54 | @@ -61,13 +61,13 @@ static unsigned max_name_len(struct inode *inode) 55 | * allocate sufficient memory to oname string. 56 | */ 57 | static int ext4_fname_encrypt(struct inode *inode, 58 | + struct ext4_crypt_info *ci, 59 | const struct qstr *iname, 60 | struct ext4_str *oname) 61 | { 62 | u32 ciphertext_len; 63 | struct ablkcipher_request *req = NULL; 64 | DECLARE_EXT4_COMPLETION_RESULT(ecr); 65 | - struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; 66 | struct crypto_ablkcipher *tfm = ci->ci_ctfm; 67 | int res = 0; 68 | char iv[EXT4_CRYPTO_BLOCK_SIZE]; 69 | @@ -141,6 +141,7 @@ static int ext4_fname_encrypt(struct inode *inode, 70 | * We trust the caller to allocate sufficient memory to oname string. 
71 | */ 72 | static int ext4_fname_decrypt(struct inode *inode, 73 | + struct ext4_crypt_info *ci, 74 | const struct ext4_str *iname, 75 | struct ext4_str *oname) 76 | { 77 | @@ -148,7 +149,6 @@ static int ext4_fname_decrypt(struct inode *inode, 78 | struct ablkcipher_request *req = NULL; 79 | DECLARE_EXT4_COMPLETION_RESULT(ecr); 80 | struct scatterlist src_sg, dst_sg; 81 | - struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; 82 | struct crypto_ablkcipher *tfm = ci->ci_ctfm; 83 | int res = 0; 84 | char iv[EXT4_CRYPTO_BLOCK_SIZE]; 85 | @@ -261,11 +261,13 @@ u32 ext4_fname_crypto_round_up(u32 size, u32 blksize) 86 | 87 | unsigned ext4_fname_encrypted_size(struct inode *inode, u32 ilen) 88 | { 89 | - struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; 90 | + struct ext4_crypt_info *ci; 91 | int padding = 32; 92 | 93 | - if (ci) 94 | + ci = ext4_get_crypt_info_rcu(inode); 95 | + if (!IS_ERR(ci) && ci) 96 | padding = 4 << (ci->ci_flags & EXT4_POLICY_FLAGS_PAD_MASK); 97 | + rcu_read_unlock(); 98 | if (ilen < EXT4_CRYPTO_BLOCK_SIZE) 99 | ilen = EXT4_CRYPTO_BLOCK_SIZE; 100 | return ext4_fname_crypto_round_up(ilen, padding); 101 | @@ -316,6 +318,7 @@ int _ext4_fname_disk_to_usr(struct inode *inode, 102 | { 103 | char buf[24]; 104 | int ret; 105 | + struct ext4_crypt_info *ci; 106 | 107 | if (iname->len < 3) { 108 | /*Check for . and .. 
*/ 109 | @@ -330,8 +333,15 @@ int _ext4_fname_disk_to_usr(struct inode *inode, 110 | EXT4_ERROR_INODE(inode, "encrypted inode too small"); 111 | return -EUCLEAN; 112 | } 113 | - if (EXT4_I(inode)->i_crypt_info) 114 | - return ext4_fname_decrypt(inode, iname, oname); 115 | + ci = ext4_get_crypt_info_rcu(inode); 116 | + if (IS_ERR(ci)) 117 | + return PTR_ERR(ci); 118 | + if (ci) { 119 | + int ret = ext4_fname_decrypt(inode, ci, iname, oname); 120 | + 121 | + rcu_read_unlock(); 122 | + return ret; 123 | + } 124 | 125 | if (iname->len <= EXT4_FNAME_CRYPTO_DIGEST_SIZE) { 126 | ret = digest_encode(iname->name, iname->len, oname->name); 127 | @@ -369,8 +379,7 @@ int ext4_fname_usr_to_disk(struct inode *inode, 128 | const struct qstr *iname, 129 | struct ext4_str *oname) 130 | { 131 | - int res; 132 | - struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; 133 | + struct ext4_crypt_info *ci; 134 | 135 | if (iname->len < 3) { 136 | /*Check for . and .. */ 137 | @@ -382,8 +391,13 @@ int ext4_fname_usr_to_disk(struct inode *inode, 138 | return oname->len; 139 | } 140 | } 141 | + ci = ext4_get_crypt_info_rcu(inode); 142 | + if (IS_ERR(ci)) 143 | + return PTR_ERR(ci); 144 | if (ci) { 145 | - res = ext4_fname_encrypt(inode, iname, oname); 146 | + int res = ext4_fname_encrypt(inode, ci, iname, oname); 147 | + 148 | + rcu_read_unlock(); 149 | return res; 150 | } 151 | /* Without a proper key, a user is not allowed to modify the filenames 152 | @@ -409,24 +423,29 @@ int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, 153 | fname->disk_name.len = iname->len; 154 | return 0; 155 | } 156 | - ret = ext4_get_encryption_info(dir); 157 | + ret = ext4_fname_crypto_alloc_buffer(dir, iname->len, 158 | + &fname->crypto_buf); 159 | if (ret) 160 | return ret; 161 | - ci = EXT4_I(dir)->i_crypt_info; 162 | + 163 | + ci = ext4_get_crypt_info_rcu(dir); 164 | + if (IS_ERR(ci)) { 165 | + ret = PTR_ERR(ci); 166 | + goto errout; 167 | + } 168 | if (ci) { 169 | - ret = 
ext4_fname_crypto_alloc_buffer(dir, iname->len, 170 | - &fname->crypto_buf); 171 | - if (ret < 0) 172 | - return ret; 173 | - ret = ext4_fname_encrypt(dir, iname, &fname->crypto_buf); 174 | + ret = ext4_fname_encrypt(dir, ci, iname, &fname->crypto_buf); 175 | + rcu_read_unlock(); 176 | if (ret < 0) 177 | goto errout; 178 | fname->disk_name.name = fname->crypto_buf.name; 179 | fname->disk_name.len = fname->crypto_buf.len; 180 | return 0; 181 | } 182 | - if (!lookup) 183 | - return -EACCES; 184 | + if (!lookup) { 185 | + ret = -EACCES; 186 | + goto errout; 187 | + } 188 | 189 | /* We don't have the key and we are doing a lookup; decode the 190 | * user-supplied name 191 | @@ -434,12 +453,11 @@ int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, 192 | if (iname->name[0] == '_') 193 | bigname = 1; 194 | if ((bigname && (iname->len != 33)) || 195 | - (!bigname && (iname->len > 43))) 196 | - return -ENOENT; 197 | + (!bigname && (iname->len > 43))) { 198 | + ret = -ENOENT; 199 | + goto errout; 200 | + } 201 | 202 | - fname->crypto_buf.name = kmalloc(32, GFP_KERNEL); 203 | - if (fname->crypto_buf.name == NULL) 204 | - return -ENOMEM; 205 | ret = digest_decode(iname->name + bigname, iname->len - bigname, 206 | fname->crypto_buf.name); 207 | if (ret < 0) { 208 | diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c 209 | index 9a16d1e..d9b4cc0 100644 210 | --- a/fs/ext4/crypto_key.c 211 | +++ b/fs/ext4/crypto_key.c 212 | @@ -111,7 +111,7 @@ void ext4_free_encryption_info(struct inode *inode, 213 | ext4_free_crypt_info(ci); 214 | } 215 | 216 | -int _ext4_get_encryption_info(struct inode *inode) 217 | +int ext4_setup_encryption_info(struct inode *inode) 218 | { 219 | struct ext4_inode_info *ei = EXT4_I(inode); 220 | struct ext4_crypt_info *crypt_info; 221 | diff --git a/fs/ext4/crypto_policy.c b/fs/ext4/crypto_policy.c 222 | index ad05069..2f5743f 100644 223 | --- a/fs/ext4/crypto_policy.c 224 | +++ b/fs/ext4/crypto_policy.c 225 | @@ -159,25 +159,26 
@@ int ext4_is_child_context_consistent_with_parent(struct inode *parent, 226 | /* if the child directory is not encrypted, this is always a problem */ 227 | if (!ext4_encrypted_inode(child)) 228 | return 0; 229 | - res = ext4_get_encryption_info(parent); 230 | - if (res) 231 | + parent_ci = ext4_get_crypt_info_rcu(parent); 232 | + if (IS_ERR(parent_ci)) 233 | return 0; 234 | - res = ext4_get_encryption_info(child); 235 | - if (res) 236 | + child_ci = ext4_get_crypt_info_rcu(child); 237 | + if (IS_ERR(child_ci)) 238 | return 0; 239 | - parent_ci = EXT4_I(parent)->i_crypt_info; 240 | - child_ci = EXT4_I(child)->i_crypt_info; 241 | if (!parent_ci && !child_ci) 242 | - return 1; 243 | - if (!parent_ci || !child_ci) 244 | - return 0; 245 | - 246 | - return (memcmp(parent_ci->ci_master_key, 247 | - child_ci->ci_master_key, 248 | - EXT4_KEY_DESCRIPTOR_SIZE) == 0 && 249 | - (parent_ci->ci_data_mode == child_ci->ci_data_mode) && 250 | - (parent_ci->ci_filename_mode == child_ci->ci_filename_mode) && 251 | - (parent_ci->ci_flags == child_ci->ci_flags)); 252 | + res = 1; 253 | + else if (!parent_ci || !child_ci) 254 | + res = 0; 255 | + else 256 | + res = (memcmp(parent_ci->ci_master_key, 257 | + child_ci->ci_master_key, 258 | + EXT4_KEY_DESCRIPTOR_SIZE) == 0 && 259 | + (parent_ci->ci_data_mode == child_ci->ci_data_mode) && 260 | + (parent_ci->ci_filename_mode == 261 | + child_ci->ci_filename_mode) && 262 | + (parent_ci->ci_flags == child_ci->ci_flags)); 263 | + rcu_read_unlock(); 264 | + return res; 265 | } 266 | 267 | /** 268 | @@ -193,10 +194,9 @@ int ext4_inherit_context(struct inode *parent, struct inode *child) 269 | struct ext4_crypt_info *ci; 270 | int res; 271 | 272 | - res = ext4_get_encryption_info(parent); 273 | - if (res < 0) 274 | - return res; 275 | - ci = EXT4_I(parent)->i_crypt_info; 276 | + ci = ext4_get_crypt_info_rcu(parent); 277 | + if (IS_ERR(ci)) 278 | + return PTR_ERR(ci); 279 | if (ci == NULL) 280 | return -ENOKEY; 281 | 282 | @@ -216,6 +216,7 @@ int 
ext4_inherit_context(struct inode *parent, struct inode *child) 283 | memcpy(ctx.master_key_descriptor, ci->ci_master_key, 284 | EXT4_KEY_DESCRIPTOR_SIZE); 285 | } 286 | + rcu_read_unlock(); 287 | get_random_bytes(ctx.nonce, EXT4_KEY_DERIVATION_NONCE_SIZE); 288 | res = ext4_xattr_set(child, EXT4_XATTR_INDEX_ENCRYPTION, 289 | EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, 290 | @@ -223,7 +224,7 @@ int ext4_inherit_context(struct inode *parent, struct inode *child) 291 | if (!res) { 292 | ext4_set_inode_flag(child, EXT4_INODE_ENCRYPT); 293 | ext4_clear_inode_state(child, EXT4_STATE_MAY_INLINE_DATA); 294 | - res = ext4_get_encryption_info(child); 295 | + res = ext4_setup_encryption_info(child); 296 | } 297 | return res; 298 | } 299 | diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c 300 | index 1d1bca7..f4dba17 100644 301 | --- a/fs/ext4/dir.c 302 | +++ b/fs/ext4/dir.c 303 | @@ -594,8 +594,12 @@ finished: 304 | 305 | static int ext4_dir_open(struct inode * inode, struct file * filp) 306 | { 307 | - if (ext4_encrypted_inode(inode)) 308 | - return ext4_get_encryption_info(inode) ? 
-EACCES : 0; 309 | + if (ext4_encrypted_inode(inode)) { 310 | + int ret = ext4_setup_encryption_info(inode); 311 | + 312 | + if (ret && ret != -ENOKEY) 313 | + return -EACCES; 314 | + } 315 | return 0; 316 | } 317 | 318 | diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h 319 | index edbe347..16ca80d 100644 320 | --- a/fs/ext4/ext4.h 321 | +++ b/fs/ext4/ext4.h 322 | @@ -2327,39 +2327,46 @@ static inline void ext4_fname_free_filename(struct ext4_filename *fname) { } 323 | /* crypto_key.c */ 324 | void ext4_free_crypt_info(struct ext4_crypt_info *ci); 325 | void ext4_free_encryption_info(struct inode *inode, struct ext4_crypt_info *ci); 326 | -int _ext4_get_encryption_info(struct inode *inode); 327 | +int ext4_setup_encryption_info(struct inode *inode); 328 | 329 | #ifdef CONFIG_EXT4_FS_ENCRYPTION 330 | int ext4_has_encryption_key(struct inode *inode); 331 | 332 | -static inline int ext4_get_encryption_info(struct inode *inode) 333 | +static inline 334 | +struct ext4_crypt_info *ext4_get_crypt_info_rcu(struct inode *inode) 335 | { 336 | - struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; 337 | + int ret; 338 | + struct ext4_crypt_info *ci; 339 | 340 | + rcu_read_lock(); 341 | + ci = EXT4_I(inode)->i_crypt_info; 342 | if (!ci || 343 | (ci->ci_keyring_key && 344 | (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | 345 | (1 << KEY_FLAG_REVOKED) | 346 | - (1 << KEY_FLAG_DEAD))))) 347 | - return _ext4_get_encryption_info(inode); 348 | - return 0; 349 | -} 350 | - 351 | -static inline struct ext4_crypt_info *ext4_encryption_info(struct inode *inode) 352 | -{ 353 | - return EXT4_I(inode)->i_crypt_info; 354 | + (1 << KEY_FLAG_DEAD))))) { 355 | + rcu_read_unlock(); 356 | + ret = ext4_setup_encryption_info(inode); 357 | + if (ret && ret != -ENOKEY) { 358 | + return ERR_PTR(ret); 359 | + } 360 | + rcu_read_lock(); 361 | + } 362 | + ci = EXT4_I(inode)->i_crypt_info; 363 | + if (ci == NULL) 364 | + rcu_read_unlock(); 365 | + return ci; 366 | } 367 | - 368 | #else 369 | 
static inline int ext4_has_encryption_key(struct inode *inode) 370 | { 371 | return 0; 372 | } 373 | -static inline int ext4_get_encryption_info(struct inode *inode) 374 | +static inline int ext4_setup_encryption_info(struct inode *inode) 375 | { 376 | - return 0; 377 | + return -ENOKEY; 378 | } 379 | -static inline struct ext4_crypt_info *ext4_encryption_info(struct inode *inode) 380 | +struct ext4_crypt_info *ext4_get_crypt_info_rcu(struct inode *inode) 381 | { 382 | return NULL; 383 | } 384 | diff --git a/fs/ext4/file.c b/fs/ext4/file.c 385 | index 749b222..c977c7a 100644 386 | --- a/fs/ext4/file.c 387 | +++ b/fs/ext4/file.c 388 | @@ -327,11 +327,9 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) 389 | struct inode *inode = file->f_mapping->host; 390 | 391 | if (ext4_encrypted_inode(inode)) { 392 | - int err = ext4_get_encryption_info(inode); 393 | + int err = ext4_setup_encryption_info(inode); 394 | if (err) 395 | - return 0; 396 | - if (ext4_encryption_info(inode) == NULL) 397 | - return -ENOKEY; 398 | + return err; 399 | } 400 | file_accessed(file); 401 | if (IS_DAX(file_inode(file))) { 402 | @@ -385,11 +383,11 @@ static int ext4_file_open(struct inode * inode, struct file * filp) 403 | } 404 | } 405 | if (ext4_encrypted_inode(inode)) { 406 | - ret = ext4_get_encryption_info(inode); 407 | + ret = ext4_setup_encryption_info(inode); 408 | + if (ret == -ENOKEY) 409 | + return ret; 410 | if (ret) 411 | return -EACCES; 412 | - if (ext4_encryption_info(inode) == NULL) 413 | - return -ENOKEY; 414 | } 415 | /* 416 | * Set up the jbd2_inode if we are opening the inode for 417 | diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c 418 | index 3fcfd50..3abf659 100644 419 | --- a/fs/ext4/ialloc.c 420 | +++ b/fs/ext4/ialloc.c 421 | @@ -765,11 +765,11 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, 422 | if ((ext4_encrypted_inode(dir) || 423 | DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb))) && 424 | (S_ISREG(mode) || 
S_ISDIR(mode) || S_ISLNK(mode))) { 425 | - err = ext4_get_encryption_info(dir); 426 | + err = ext4_setup_encryption_info(dir); 427 | + if (err == -ENOKEY) 428 | + return ERR_PTR(-EPERM); 429 | if (err) 430 | return ERR_PTR(err); 431 | - if (ext4_encryption_info(dir) == NULL) 432 | - return ERR_PTR(-EPERM); 433 | if (!handle) 434 | nblocks += EXT4_DATA_TRANS_BLOCKS(dir->i_sb); 435 | encrypt = 1; 436 | diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c 437 | index 2047ff7..9992a22 100644 438 | --- a/fs/ext4/namei.c 439 | +++ b/fs/ext4/namei.c 440 | @@ -618,12 +618,6 @@ static struct stats dx_show_leaf(struct inode *dir, 441 | 442 | name = de->name; 443 | len = de->name_len; 444 | - if (ext4_encrypted_inode(inode)) 445 | - res = ext4_get_encryption_info(dir); 446 | - if (res) { 447 | - printk(KERN_WARNING "Error setting up" 448 | - " fname crypto: %d\n", res); 449 | - } 450 | if (ctx == NULL) { 451 | /* Directory is not encrypted */ 452 | ext4fs_dirhash(de->name, 453 | @@ -967,11 +961,6 @@ static int htree_dirblock_to_tree(struct file *dir_file, 454 | #ifdef CONFIG_EXT4_FS_ENCRYPTION 455 | /* Check if the directory is encrypted */ 456 | if (ext4_encrypted_inode(dir)) { 457 | - err = ext4_get_encryption_info(dir); 458 | - if (err < 0) { 459 | - brelse(bh); 460 | - return err; 461 | - } 462 | err = ext4_fname_crypto_alloc_buffer(dir, EXT4_NAME_LEN, 463 | &fname_crypto_str); 464 | if (err < 0) { 465 | @@ -3058,11 +3047,11 @@ static int ext4_symlink(struct inode *dir, 466 | encryption_required = (ext4_encrypted_inode(dir) || 467 | DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb))); 468 | if (encryption_required) { 469 | - err = ext4_get_encryption_info(dir); 470 | + err = ext4_setup_encryption_info(dir); 471 | + if (err == -ENOKEY) 472 | + return -EPERM; 473 | if (err) 474 | return err; 475 | - if (ext4_encryption_info(dir) == NULL) 476 | - return -EPERM; 477 | disk_link.len = (ext4_fname_encrypted_size(dir, len) + 478 | sizeof(struct ext4_encrypted_symlink_data)); 479 | sd = 
kzalloc(disk_link.len, GFP_KERNEL); 480 | diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c 481 | index e8e7af6..a9b5777 100644 482 | --- a/fs/ext4/symlink.c 483 | +++ b/fs/ext4/symlink.c 484 | @@ -34,10 +34,6 @@ static const char *ext4_encrypted_follow_link(struct dentry *dentry, void **cook 485 | int res; 486 | u32 plen, max_size = inode->i_sb->s_blocksize; 487 | 488 | - res = ext4_get_encryption_info(inode); 489 | - if (res) 490 | - return ERR_PTR(res); 491 | - 492 | if (ext4_inode_is_fast_symlink(inode)) { 493 | caddr = (char *) EXT4_I(inode)->i_data; 494 | max_size = sizeof(EXT4_I(inode)->i_data); 495 | -------------------------------------------------------------------------------- /old-patches/delalloc-debug: -------------------------------------------------------------------------------- 1 | ext4: add delalloc debugging 2 | 3 | This adds a file in /proc/fs/ext4/ which when opened for reading, 4 | will trigger debugging code that dumps a lot of information about 5 | inodes subject to delayed allocation to the console. 
6 | 7 | Signed-off-by: "Theodore Ts'o" 8 | --- 9 | fs/ext4/sysfs.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 10 | 1 file changed, 69 insertions(+) 11 | 12 | diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c 13 | index 9d29723..507d33f 100644 14 | --- a/fs/ext4/sysfs.c 15 | +++ b/fs/ext4/sysfs.c 16 | @@ -371,6 +371,74 @@ static struct kobject ext4_feat = { 17 | .kset = &ext4_kset, 18 | }; 19 | 20 | +#ifdef CONFIG_EXT4_DEBUG 21 | +static void print_inode_delalloc_info(struct inode *inode) 22 | +{ 23 | + if (!EXT4_I(inode)->i_reserved_data_blocks || 24 | + !EXT4_I(inode)->i_reserved_meta_blocks) 25 | + return; 26 | + 27 | + printk(KERN_DEBUG "ino %lu: %u %u\n", inode->i_ino, 28 | + EXT4_I(inode)->i_reserved_data_blocks, 29 | + EXT4_I(inode)->i_reserved_meta_blocks); 30 | +} 31 | + 32 | +static int debug_delalloc_show(struct seq_file *seq, void *offset) 33 | +{ 34 | + return 0; 35 | +} 36 | + 37 | +static int options_delalloc_debug_open_fs(struct inode *proc_inode, 38 | + struct file *file) 39 | +{ 40 | + struct super_block *sb = PDE_DATA(proc_inode); 41 | + struct ext4_sb_info *sbi = EXT4_SB(sb); 42 | + struct inode *inode; 43 | + extern spinlock_t inode_sb_list_lock; 44 | + 45 | + printk(KERN_DEBUG "EXT4-fs debug delalloc of %s\n", sb->s_id); 46 | + printk(KERN_DEBUG "EXT4-fs: dirty clusters %lld free clusters %lld\n", 47 | + percpu_counter_sum(&sbi->s_dirtyclusters_counter), 48 | + percpu_counter_sum(&sbi->s_freeclusters_counter)); 49 | + 50 | +#ifndef MODULE 51 | + spin_lock(&inode_sb_list_lock); 52 | + if (!list_empty(&sb->s_bdi->wb.b_dirty)) { 53 | + printk(KERN_DEBUG "s_bdi->wb.b_dirty list:\n"); 54 | + list_for_each_entry(inode, &sb->s_bdi->wb.b_dirty, 55 | + i_io_list) { 56 | + print_inode_delalloc_info(inode); 57 | + } 58 | + } 59 | + if (!list_empty(&sb->s_bdi->wb.b_io)) { 60 | + printk(KERN_DEBUG "s_bdi->wb.b_io list:\n"); 61 | + list_for_each_entry(inode, &sb->s_bdi->wb.b_io, 62 | + i_io_list) { 63 | + 
print_inode_delalloc_info(inode); 64 | + } 65 | + } 66 | + if (!list_empty(&sb->s_bdi->wb.b_more_io)) { 67 | + printk(KERN_DEBUG "s_bdi->wb.b_more_io list:\n"); 68 | + list_for_each_entry(inode, &sb->s_bdi->wb.b_more_io, 69 | + i_io_list) { 70 | + print_inode_delalloc_info(inode); 71 | + } 72 | + } 73 | + spin_unlock(&inode_sb_list_lock); 74 | + printk(KERN_DEBUG "ext4 debug delalloc done\n"); 75 | +#endif 76 | + return single_open(file, debug_delalloc_show, sb); 77 | +} 78 | + 79 | +static const struct file_operations ext4_seq_delalloc_debug_fops = { 80 | + .owner = THIS_MODULE, 81 | + .open = options_delalloc_debug_open_fs, 82 | + .read = seq_read, 83 | + .llseek = seq_lseek, 84 | + .release = single_release, 85 | +}; 86 | +#endif 87 | + 88 | #define PROC_FILE_SHOW_DEFN(name) \ 89 | static int name##_open(struct inode *inode, struct file *file) \ 90 | { \ 91 | @@ -398,6 +466,7 @@ static struct ext4_proc_files { 92 | PROC_FILE_LIST(options), 93 | PROC_FILE_LIST(es_shrinker_info), 94 | PROC_FILE_LIST(mb_groups), 95 | + PROC_FILE_LIST(delalloc_debug), 96 | { NULL, NULL }, 97 | }; 98 | 99 | -------------------------------------------------------------------------------- /old-patches/dont-use-io-end-if-not-needed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tytso/ext4-patch-queue/fc40a5a6ccf08de7a0119c7f52759a33909a7177/old-patches/dont-use-io-end-if-not-needed -------------------------------------------------------------------------------- /old-patches/dump-in-use-buffers: -------------------------------------------------------------------------------- 1 | Add a ioctl which dumps out all of the in-use buffer heads for a block device 2 | 3 | Signed-off-by: "Theodore Ts'o" 4 | --- 5 | block/compat_ioctl.c | 4 ++++ 6 | block/ioctl.c | 11 +++++++++++ 7 | fs/buffer.c | 40 ++++++++++++++++++++++++++++++++++++++++ 8 | 3 files changed, 55 insertions(+), 0 deletions(-) 9 | 10 | diff --git 
a/block/compat_ioctl.c b/block/compat_ioctl.c 11 | index 4eb8e9e..2535cb1 100644 12 | --- a/block/compat_ioctl.c 13 | +++ b/block/compat_ioctl.c 14 | @@ -11,6 +11,9 @@ 15 | #include 16 | #include 17 | 18 | +/* For debugging purposes; see block/ioctl.c */ 19 | +#define BLKDUMPUSEDBUFFERS _IO(0x12,130) 20 | + 21 | static int compat_put_ushort(unsigned long arg, unsigned short val) 22 | { 23 | return put_user(val, (unsigned short __user *)compat_ptr(arg)); 24 | @@ -749,6 +752,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) 25 | return compat_put_int(arg, bdev_alignment_offset(bdev)); 26 | case BLKDISCARDZEROES: 27 | return compat_put_uint(arg, bdev_discard_zeroes_data(bdev)); 28 | + case BLKDUMPUSEDBUFFERS: 29 | case BLKFLSBUF: 30 | case BLKROSET: 31 | case BLKDISCARD: 32 | diff --git a/block/ioctl.c b/block/ioctl.c 33 | index be48ea5..8cc002b 100644 34 | --- a/block/ioctl.c 35 | +++ b/block/ioctl.c 36 | @@ -8,6 +8,10 @@ 37 | #include 38 | #include 39 | 40 | +/* For debugging purposes */ 41 | +#define BLKDUMPUSEDBUFFERS _IO(0x12,130) 42 | +extern void dump_used_buffers(struct block_device *bdev); 43 | + 44 | static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user *arg) 45 | { 46 | struct block_device *bdevp; 47 | @@ -332,6 +336,13 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, 48 | ret = blk_trace_ioctl(bdev, cmd, (char __user *) arg); 49 | unlock_kernel(); 50 | break; 51 | + case BLKDUMPUSEDBUFFERS: 52 | + if (!capable(CAP_SYS_ADMIN)) 53 | + return -EACCES; 54 | + dump_used_buffers(bdev); 55 | + ret = 0; 56 | + break; 57 | + 58 | default: 59 | ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg); 60 | } 61 | diff --git a/fs/buffer.c b/fs/buffer.c 62 | index 6fa5302..8438330 100644 63 | --- a/fs/buffer.c 64 | +++ b/fs/buffer.c 65 | @@ -33,6 +33,7 @@ 66 | #include 67 | #include 68 | #include 69 | +#include 70 | #include 71 | #include 72 | #include 73 | @@ -300,6 +301,45 @@ static void 
free_more_memory(void) 74 | } 75 | } 76 | 77 | +void dump_used_buffers(struct block_device *bdev) 78 | +{ 79 | + struct inode *bd_inode = bdev->bd_inode; 80 | + struct address_space *bd_mapping = bd_inode->i_mapping; 81 | + struct buffer_head *bh, *head; 82 | + struct pagevec pvec; 83 | + unsigned long index = 0; 84 | + int nr_pages, i, count, total = 0; 85 | + char b[BDEVNAME_SIZE]; 86 | + 87 | + spin_lock(&bd_mapping->private_lock); 88 | + printk(KERN_INFO "Begin dump of block device %s\n", bdevname(bdev, b)); 89 | + while (1) { 90 | + nr_pages = pagevec_lookup(&pvec, bd_mapping, index, PAGEVEC_SIZE); 91 | + if (nr_pages == 0) 92 | + break; 93 | + for (i = 0; i < nr_pages; i++) { 94 | + struct page *page = pvec.pages[i]; 95 | + index = page->index + 1; 96 | + 97 | + if (!page_has_buffers(page)) 98 | + continue; 99 | + bh = head = page_buffers(page); 100 | + do { 101 | + count = atomic_read(&bh->b_count); 102 | + if (count) { 103 | + printk(KERN_INFO 104 | + "buffer in-use: block %Lu count %d\n", 105 | + (unsigned long long) bh->b_blocknr, count); 106 | + total++; 107 | + } 108 | + bh = bh->b_this_page; 109 | + } while (bh != head); 110 | + } 111 | + } 112 | + printk(KERN_INFO "Total number of in-use buffers: %d\n", total); 113 | + spin_unlock(&bd_mapping->private_lock); 114 | +} 115 | + 116 | /* 117 | * I/O completion handler for block_read_full_page() - pages 118 | * which come unlocked at the end of I/O. 119 | -------------------------------------------------------------------------------- /old-patches/include-mpage-functions-into-readpage.c: -------------------------------------------------------------------------------- 1 | ext4: copy mpage_readpage() and mpage_readpages() fs/ext4/readpage.c 2 | 3 | Move the functions which we need from fs/mpage.c into 4 | fs/ext4/readpage.c. This will allow us to proceed with the 5 | refactorization of these functions and eventual merger with the 6 | functions in fs/ext4/page_io.c. 
7 | 8 | Signed-off-by: Theodore Ts'o 9 | --- 10 | fs/ext4/readpage.c | 326 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 11 | 1 file changed, 320 insertions(+), 6 deletions(-) 12 | 13 | diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c 14 | index b5249db..3b29da1 100644 15 | --- a/fs/ext4/readpage.c 16 | +++ b/fs/ext4/readpage.c 17 | @@ -23,6 +23,7 @@ 18 | #include 19 | #include 20 | #include 21 | +#include 22 | 23 | #include "ext4_jbd2.h" 24 | #include "xattr.h" 25 | @@ -30,31 +31,344 @@ 26 | 27 | #include 28 | 29 | -int ext4_readpage(struct file *file, struct page *page) 30 | +/* 31 | + * I/O completion handler for multipage BIOs. 32 | + * 33 | + * The mpage code never puts partial pages into a BIO (except for end-of-file). 34 | + * If a page does not map to a contiguous run of blocks then it simply falls 35 | + * back to block_read_full_page(). 36 | + * 37 | + * Why is this? If a page's completion depends on a number of different BIOs 38 | + * which can complete in any order (or at the same time) then determining the 39 | + * status of that page is hard. See end_buffer_async_read() for the details. 40 | + * There is no point in duplicating all that complexity. 
41 | + */ 42 | +static void mpage_end_io(struct bio *bio, int err) 43 | +{ 44 | + struct bio_vec *bv; 45 | + int i; 46 | + 47 | + bio_for_each_segment_all(bv, bio, i) { 48 | + struct page *page = bv->bv_page; 49 | + page_endio(page, bio_data_dir(bio), err); 50 | + } 51 | + 52 | + bio_put(bio); 53 | +} 54 | + 55 | +static struct bio *mpage_bio_submit(int rw, struct bio *bio) 56 | +{ 57 | + bio->bi_end_io = mpage_end_io; 58 | + submit_bio(rw, bio); 59 | + return NULL; 60 | +} 61 | + 62 | +static struct bio * 63 | +mpage_alloc(struct block_device *bdev, 64 | + sector_t first_sector, int nr_vecs, 65 | + gfp_t gfp_flags) 66 | +{ 67 | + struct bio *bio; 68 | + 69 | + bio = bio_alloc(gfp_flags, nr_vecs); 70 | + 71 | + if (bio == NULL && (current->flags & PF_MEMALLOC)) { 72 | + while (!bio && (nr_vecs /= 2)) 73 | + bio = bio_alloc(gfp_flags, nr_vecs); 74 | + } 75 | + 76 | + if (bio) { 77 | + bio->bi_bdev = bdev; 78 | + bio->bi_iter.bi_sector = first_sector; 79 | + } 80 | + return bio; 81 | +} 82 | + 83 | +/* 84 | + * support function for mpage_readpages. The fs supplied get_block might 85 | + * return an up to date buffer. This is used to map that buffer into 86 | + * the page, which allows readpage to avoid triggering a duplicate call 87 | + * to get_block. 88 | + * 89 | + * The idea is to avoid adding buffers to pages that don't already have 90 | + * them. So when the buffer is up to date and the page size == block size, 91 | + * this marks the page up to date instead of adding new buffers. 
92 | + */ 93 | +static void 94 | +map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block) 95 | +{ 96 | + struct inode *inode = page->mapping->host; 97 | + struct buffer_head *page_bh, *head; 98 | + int block = 0; 99 | + 100 | + if (!page_has_buffers(page)) { 101 | + /* 102 | + * don't make any buffers if there is only one buffer on 103 | + * the page and the page just needs to be set up to date 104 | + */ 105 | + if (inode->i_blkbits == PAGE_CACHE_SHIFT && 106 | + buffer_uptodate(bh)) { 107 | + SetPageUptodate(page); 108 | + return; 109 | + } 110 | + create_empty_buffers(page, 1 << inode->i_blkbits, 0); 111 | + } 112 | + head = page_buffers(page); 113 | + page_bh = head; 114 | + do { 115 | + if (block == page_block) { 116 | + page_bh->b_state = bh->b_state; 117 | + page_bh->b_bdev = bh->b_bdev; 118 | + page_bh->b_blocknr = bh->b_blocknr; 119 | + break; 120 | + } 121 | + page_bh = page_bh->b_this_page; 122 | + block++; 123 | + } while (page_bh != head); 124 | +} 125 | + 126 | +/* 127 | + * This is the worker routine which does all the work of mapping the disk 128 | + * blocks and constructs largest possible bios, submits them for IO if the 129 | + * blocks are not contiguous on the disk. 130 | + * 131 | + * We pass a buffer_head back and forth and use its buffer_mapped() flag to 132 | + * represent the validity of its disk mapping and to decide when to do the next 133 | + * get_block() call. 
134 | + */ 135 | +static struct bio * 136 | +do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, 137 | + sector_t *last_block_in_bio, struct buffer_head *map_bh, 138 | + unsigned long *first_logical_block, get_block_t get_block) 139 | { 140 | - int ret = -EAGAIN; 141 | struct inode *inode = page->mapping->host; 142 | + const unsigned blkbits = inode->i_blkbits; 143 | + const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits; 144 | + const unsigned blocksize = 1 << blkbits; 145 | + sector_t block_in_file; 146 | + sector_t last_block; 147 | + sector_t last_block_in_file; 148 | + sector_t blocks[MAX_BUF_PER_PAGE]; 149 | + unsigned page_block; 150 | + unsigned first_hole = blocks_per_page; 151 | + struct block_device *bdev = NULL; 152 | + int length; 153 | + int fully_mapped = 1; 154 | + unsigned nblocks; 155 | + unsigned relative_block; 156 | + 157 | + if (page_has_buffers(page)) 158 | + goto confused; 159 | + 160 | + block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); 161 | + last_block = block_in_file + nr_pages * blocks_per_page; 162 | + last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits; 163 | + if (last_block > last_block_in_file) 164 | + last_block = last_block_in_file; 165 | + page_block = 0; 166 | + 167 | + /* 168 | + * Map blocks using the result from the previous get_blocks call first. 
169 | + */ 170 | + nblocks = map_bh->b_size >> blkbits; 171 | + if (buffer_mapped(map_bh) && block_in_file > *first_logical_block && 172 | + block_in_file < (*first_logical_block + nblocks)) { 173 | + unsigned map_offset = block_in_file - *first_logical_block; 174 | + unsigned last = nblocks - map_offset; 175 | + 176 | + for (relative_block = 0; ; relative_block++) { 177 | + if (relative_block == last) { 178 | + clear_buffer_mapped(map_bh); 179 | + break; 180 | + } 181 | + if (page_block == blocks_per_page) 182 | + break; 183 | + blocks[page_block] = map_bh->b_blocknr + map_offset + 184 | + relative_block; 185 | + page_block++; 186 | + block_in_file++; 187 | + } 188 | + bdev = map_bh->b_bdev; 189 | + } 190 | + 191 | + /* 192 | + * Then do more get_blocks calls until we are done with this page. 193 | + */ 194 | + map_bh->b_page = page; 195 | + while (page_block < blocks_per_page) { 196 | + map_bh->b_state = 0; 197 | + map_bh->b_size = 0; 198 | + 199 | + if (block_in_file < last_block) { 200 | + map_bh->b_size = (last_block-block_in_file) << blkbits; 201 | + if (get_block(inode, block_in_file, map_bh, 0)) 202 | + goto confused; 203 | + *first_logical_block = block_in_file; 204 | + } 205 | + 206 | + if (!buffer_mapped(map_bh)) { 207 | + fully_mapped = 0; 208 | + if (first_hole == blocks_per_page) 209 | + first_hole = page_block; 210 | + page_block++; 211 | + block_in_file++; 212 | + continue; 213 | + } 214 | + 215 | + /* some filesystems will copy data into the page during 216 | + * the get_block call, in which case we don't want to 217 | + * read it again. 
map_buffer_to_page copies the data 218 | + * we just collected from get_block into the page's buffers 219 | + * so readpage doesn't have to repeat the get_block call 220 | + */ 221 | + if (buffer_uptodate(map_bh)) { 222 | + map_buffer_to_page(page, map_bh, page_block); 223 | + goto confused; 224 | + } 225 | + 226 | + if (first_hole != blocks_per_page) 227 | + goto confused; /* hole -> non-hole */ 228 | + 229 | + /* Contiguous blocks? */ 230 | + if (page_block && blocks[page_block-1] != map_bh->b_blocknr-1) 231 | + goto confused; 232 | + nblocks = map_bh->b_size >> blkbits; 233 | + for (relative_block = 0; ; relative_block++) { 234 | + if (relative_block == nblocks) { 235 | + clear_buffer_mapped(map_bh); 236 | + break; 237 | + } else if (page_block == blocks_per_page) 238 | + break; 239 | + blocks[page_block] = map_bh->b_blocknr+relative_block; 240 | + page_block++; 241 | + block_in_file++; 242 | + } 243 | + bdev = map_bh->b_bdev; 244 | + } 245 | + 246 | + if (first_hole != blocks_per_page) { 247 | + zero_user_segment(page, first_hole << blkbits, PAGE_CACHE_SIZE); 248 | + if (first_hole == 0) { 249 | + SetPageUptodate(page); 250 | + unlock_page(page); 251 | + goto out; 252 | + } 253 | + } else if (fully_mapped) { 254 | + SetPageMappedToDisk(page); 255 | + } 256 | + 257 | + if (fully_mapped && blocks_per_page == 1 && !PageUptodate(page) && 258 | + cleancache_get_page(page) == 0) { 259 | + SetPageUptodate(page); 260 | + goto confused; 261 | + } 262 | + 263 | + /* 264 | + * This page will go to BIO. Do we need to send this BIO off first? 
265 | + */ 266 | + if (bio && (*last_block_in_bio != blocks[0] - 1)) 267 | + bio = mpage_bio_submit(READ, bio); 268 | + 269 | +alloc_new: 270 | + if (bio == NULL) { 271 | + if (first_hole == blocks_per_page) { 272 | + if (!bdev_read_page(bdev, blocks[0] << (blkbits - 9), 273 | + page)) 274 | + goto out; 275 | + } 276 | + bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), 277 | + min_t(int, nr_pages, bio_get_nr_vecs(bdev)), 278 | + GFP_KERNEL); 279 | + if (bio == NULL) 280 | + goto confused; 281 | + } 282 | + 283 | + length = first_hole << blkbits; 284 | + if (bio_add_page(bio, page, length, 0) < length) { 285 | + bio = mpage_bio_submit(READ, bio); 286 | + goto alloc_new; 287 | + } 288 | + 289 | + relative_block = block_in_file - *first_logical_block; 290 | + nblocks = map_bh->b_size >> blkbits; 291 | + if ((buffer_boundary(map_bh) && relative_block == nblocks) || 292 | + (first_hole != blocks_per_page)) 293 | + bio = mpage_bio_submit(READ, bio); 294 | + else 295 | + *last_block_in_bio = blocks[blocks_per_page - 1]; 296 | +out: 297 | + return bio; 298 | + 299 | +confused: 300 | + if (bio) 301 | + bio = mpage_bio_submit(READ, bio); 302 | + if (!PageUptodate(page)) 303 | + block_read_full_page(page, get_block); 304 | + else 305 | + unlock_page(page); 306 | + goto out; 307 | +} 308 | + 309 | +int ext4_readpage(struct file *file, struct page *page) 310 | +{ 311 | + unsigned long first_logical_block = 0; 312 | + struct buffer_head map_bh; 313 | + struct inode *inode = page->mapping->host; 314 | + struct bio *bio = NULL; 315 | + sector_t last_block_in_bio = 0; 316 | + int ret = -EAGAIN; 317 | 318 | trace_ext4_readpage(page); 319 | 320 | if (ext4_has_inline_data(inode)) 321 | ret = ext4_readpage_inline(inode, page); 322 | 323 | - if (ret == -EAGAIN) 324 | - return mpage_readpage(page, ext4_get_block); 325 | + if (ret != -EAGAIN) 326 | + return ret; 327 | 328 | - return ret; 329 | + map_bh.b_state = 0; 330 | + map_bh.b_size = 0; 331 | + bio = do_mpage_readpage(bio, page, 
1, &last_block_in_bio, 332 | + &map_bh, &first_logical_block, ext4_get_block); 333 | + if (bio) 334 | + mpage_bio_submit(READ, bio); 335 | + return 0; 336 | } 337 | 338 | int ext4_readpages(struct file *file, struct address_space *mapping, 339 | struct list_head *pages, unsigned nr_pages) 340 | { 341 | struct inode *inode = mapping->host; 342 | + struct bio *bio = NULL; 343 | + unsigned page_idx; 344 | + sector_t last_block_in_bio = 0; 345 | + struct buffer_head map_bh; 346 | + unsigned long first_logical_block = 0; 347 | 348 | /* If the file has inline data, no need to do readpages. */ 349 | if (ext4_has_inline_data(inode)) 350 | return 0; 351 | 352 | - return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); 353 | + map_bh.b_state = 0; 354 | + map_bh.b_size = 0; 355 | + for (page_idx = 0; page_idx < nr_pages; page_idx++) { 356 | + struct page *page = list_entry(pages->prev, struct page, lru); 357 | + 358 | + prefetchw(&page->flags); 359 | + list_del(&page->lru); 360 | + if (!add_to_page_cache_lru(page, mapping, 361 | + page->index, GFP_KERNEL)) { 362 | + bio = do_mpage_readpage(bio, page, 363 | + nr_pages - page_idx, 364 | + &last_block_in_bio, &map_bh, 365 | + &first_logical_block, 366 | + ext4_get_block); 367 | + } 368 | + page_cache_release(page); 369 | + } 370 | + BUG_ON(!list_empty(pages)); 371 | + if (bio) 372 | + mpage_bio_submit(READ, bio); 373 | + return 0; 374 | } 375 | 376 | -------------------------------------------------------------------------------- /old-patches/inline-ext4_get_block-into-readpage: -------------------------------------------------------------------------------- 1 | ext4: call ext4_map_blocks() directly from read_page.c 2 | 3 | Use ext4_map_blocks() directly instead of going through 4 | ext4_get_block(). This allows us to drop out a lot of generic code 5 | that was not needed for ext4. 
6 | 7 | Signed-off-by: Theodore Ts'o 8 | 9 | 10 | --- 11 | fs/ext4/readpage.c | 83 ++++++++++++++++++----------------------------------------------------------------- 12 | 1 file changed, 18 insertions(+), 65 deletions(-) 13 | 14 | diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c 15 | index 3b29da1..ce3ecc1 100644 16 | --- a/fs/ext4/readpage.c 17 | +++ b/fs/ext4/readpage.c 18 | @@ -85,49 +85,6 @@ mpage_alloc(struct block_device *bdev, 19 | } 20 | 21 | /* 22 | - * support function for mpage_readpages. The fs supplied get_block might 23 | - * return an up to date buffer. This is used to map that buffer into 24 | - * the page, which allows readpage to avoid triggering a duplicate call 25 | - * to get_block. 26 | - * 27 | - * The idea is to avoid adding buffers to pages that don't already have 28 | - * them. So when the buffer is up to date and the page size == block size, 29 | - * this marks the page up to date instead of adding new buffers. 30 | - */ 31 | -static void 32 | -map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block) 33 | -{ 34 | - struct inode *inode = page->mapping->host; 35 | - struct buffer_head *page_bh, *head; 36 | - int block = 0; 37 | - 38 | - if (!page_has_buffers(page)) { 39 | - /* 40 | - * don't make any buffers if there is only one buffer on 41 | - * the page and the page just needs to be set up to date 42 | - */ 43 | - if (inode->i_blkbits == PAGE_CACHE_SHIFT && 44 | - buffer_uptodate(bh)) { 45 | - SetPageUptodate(page); 46 | - return; 47 | - } 48 | - create_empty_buffers(page, 1 << inode->i_blkbits, 0); 49 | - } 50 | - head = page_buffers(page); 51 | - page_bh = head; 52 | - do { 53 | - if (block == page_block) { 54 | - page_bh->b_state = bh->b_state; 55 | - page_bh->b_bdev = bh->b_bdev; 56 | - page_bh->b_blocknr = bh->b_blocknr; 57 | - break; 58 | - } 59 | - page_bh = page_bh->b_this_page; 60 | - block++; 61 | - } while (page_bh != head); 62 | -} 63 | - 64 | -/* 65 | * This is the worker routine which does all the 
work of mapping the disk 66 | * blocks and constructs largest possible bios, submits them for IO if the 67 | * blocks are not contiguous on the disk. 68 | @@ -138,8 +95,8 @@ map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block) 69 | */ 70 | static struct bio * 71 | do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, 72 | - sector_t *last_block_in_bio, struct buffer_head *map_bh, 73 | - unsigned long *first_logical_block, get_block_t get_block) 74 | + sector_t *last_block_in_bio, struct buffer_head *map_bh, 75 | + unsigned long *first_logical_block) 76 | { 77 | struct inode *inode = page->mapping->host; 78 | const unsigned blkbits = inode->i_blkbits; 79 | @@ -151,7 +108,7 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, 80 | sector_t blocks[MAX_BUF_PER_PAGE]; 81 | unsigned page_block; 82 | unsigned first_hole = blocks_per_page; 83 | - struct block_device *bdev = NULL; 84 | + struct block_device *bdev = inode->i_sb->s_bdev; 85 | int length; 86 | int fully_mapped = 1; 87 | unsigned nblocks; 88 | @@ -188,7 +145,6 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, 89 | page_block++; 90 | block_in_file++; 91 | } 92 | - bdev = map_bh->b_bdev; 93 | } 94 | 95 | /* 96 | @@ -200,9 +156,19 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, 97 | map_bh->b_size = 0; 98 | 99 | if (block_in_file < last_block) { 100 | - map_bh->b_size = (last_block-block_in_file) << blkbits; 101 | - if (get_block(inode, block_in_file, map_bh, 0)) 102 | + struct ext4_map_blocks map; 103 | + int ret; 104 | + 105 | + map.m_lblk = block_in_file; 106 | + map.m_len = last_block - block_in_file; 107 | + ret = ext4_map_blocks(NULL, inode, &map, 0); 108 | + if (ret < 0) 109 | goto confused; 110 | + map_bh->b_blocknr = map.m_pblk; 111 | + map_bh->b_bdev = bdev; 112 | + map_bh->b_size = inode->i_sb->s_blocksize * map.m_len; 113 | + map_bh->b_state = map.m_flags; 114 | + 115 | 
*first_logical_block = block_in_file; 116 | } 117 | 118 | @@ -215,17 +181,6 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, 119 | continue; 120 | } 121 | 122 | - /* some filesystems will copy data into the page during 123 | - * the get_block call, in which case we don't want to 124 | - * read it again. map_buffer_to_page copies the data 125 | - * we just collected from get_block into the page's buffers 126 | - * so readpage doesn't have to repeat the get_block call 127 | - */ 128 | - if (buffer_uptodate(map_bh)) { 129 | - map_buffer_to_page(page, map_bh, page_block); 130 | - goto confused; 131 | - } 132 | - 133 | if (first_hole != blocks_per_page) 134 | goto confused; /* hole -> non-hole */ 135 | 136 | @@ -243,7 +198,6 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, 137 | page_block++; 138 | block_in_file++; 139 | } 140 | - bdev = map_bh->b_bdev; 141 | } 142 | 143 | if (first_hole != blocks_per_page) { 144 | @@ -303,7 +257,7 @@ confused: 145 | if (bio) 146 | bio = mpage_bio_submit(READ, bio); 147 | if (!PageUptodate(page)) 148 | - block_read_full_page(page, get_block); 149 | + block_read_full_page(page, ext4_get_block); 150 | else 151 | unlock_page(page); 152 | goto out; 153 | @@ -329,7 +283,7 @@ int ext4_readpage(struct file *file, struct page *page) 154 | map_bh.b_state = 0; 155 | map_bh.b_size = 0; 156 | bio = do_mpage_readpage(bio, page, 1, &last_block_in_bio, 157 | - &map_bh, &first_logical_block, ext4_get_block); 158 | + &map_bh, &first_logical_block); 159 | if (bio) 160 | mpage_bio_submit(READ, bio); 161 | return 0; 162 | @@ -361,8 +315,7 @@ int ext4_readpages(struct file *file, struct address_space *mapping, 163 | bio = do_mpage_readpage(bio, page, 164 | nr_pages - page_idx, 165 | &last_block_in_bio, &map_bh, 166 | - &first_logical_block, 167 | - ext4_get_block); 168 | + &first_logical_block); 169 | } 170 | page_cache_release(page); 171 | } 172 | 
-------------------------------------------------------------------------------- /old-patches/move-read-page-functions-to-new-file: -------------------------------------------------------------------------------- 1 | ext4: move ext4_readpage() and ext4_readpages() to their own file 2 | 3 | In preparation for weaning ext4 completely off of fs/mpage.c, move the 4 | readpage[s] function to their own file. Eventually we'll probably end 5 | up moving the writepage[s] function here and renaming this to 6 | something like read_write_page.c, or some such, but for now, let's 7 | keep things simple. 8 | 9 | Signed-off-by: Theodore Ts'o 10 | --- 11 | fs/ext4/Makefile | 2 +- 12 | fs/ext4/ext4.h | 5 +++++ 13 | fs/ext4/inode.c | 29 ----------------------------- 14 | fs/ext4/readpage.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 15 | 4 files changed, 66 insertions(+), 30 deletions(-) 16 | 17 | diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile 18 | index 0310fec..cd6f50f 100644 19 | --- a/fs/ext4/Makefile 20 | +++ b/fs/ext4/Makefile 21 | @@ -8,7 +8,7 @@ ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ 22 | ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ 23 | ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ 24 | mmp.o indirect.o extents_status.o xattr.o xattr_user.o \ 25 | - xattr_trusted.o inline.o 26 | + xattr_trusted.o inline.o readpage.o 27 | 28 | ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o 29 | ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o 30 | diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h 31 | index f70c3fc..5c115ea 100644 32 | --- a/fs/ext4/ext4.h 33 | +++ b/fs/ext4/ext4.h 34 | @@ -2775,6 +2775,11 @@ extern int ext4_bio_write_page(struct ext4_io_submit *io, 35 | struct writeback_control *wbc, 36 | bool keep_towrite); 37 | 38 | +/* readpage.c */ 39 | +extern int ext4_readpage(struct file *file, struct page *page); 40 | +extern int ext4_readpages(struct file *file, struct address_space *mapping, 
41 | + struct list_head *pages, unsigned nr_pages); 42 | + 43 | /* mmp.c */ 44 | extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); 45 | 46 | diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c 47 | index d5dd7d4..b3c7b92 100644 48 | --- a/fs/ext4/inode.c 49 | +++ b/fs/ext4/inode.c 50 | @@ -2798,35 +2798,6 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) 51 | return generic_block_bmap(mapping, block, ext4_get_block); 52 | } 53 | 54 | -static int ext4_readpage(struct file *file, struct page *page) 55 | -{ 56 | - int ret = -EAGAIN; 57 | - struct inode *inode = page->mapping->host; 58 | - 59 | - trace_ext4_readpage(page); 60 | - 61 | - if (ext4_has_inline_data(inode)) 62 | - ret = ext4_readpage_inline(inode, page); 63 | - 64 | - if (ret == -EAGAIN) 65 | - return mpage_readpage(page, ext4_get_block); 66 | - 67 | - return ret; 68 | -} 69 | - 70 | -static int 71 | -ext4_readpages(struct file *file, struct address_space *mapping, 72 | - struct list_head *pages, unsigned nr_pages) 73 | -{ 74 | - struct inode *inode = mapping->host; 75 | - 76 | - /* If the file has inline data, no need to do readpages. 
*/ 77 | - if (ext4_has_inline_data(inode)) 78 | - return 0; 79 | - 80 | - return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); 81 | -} 82 | - 83 | static void ext4_invalidatepage(struct page *page, unsigned int offset, 84 | unsigned int length) 85 | { 86 | diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c 87 | new file mode 100644 88 | index 0000000..b5249db 89 | --- /dev/null 90 | +++ b/fs/ext4/readpage.c 91 | @@ -0,0 +1,60 @@ 92 | +/* 93 | + * linux/fs/ext4/readpage.c 94 | + */ 95 | + 96 | +#include 97 | +#include 98 | +#include 99 | +#include 100 | +#include 101 | +#include 102 | +#include 103 | +#include 104 | +#include 105 | +#include 106 | +#include 107 | +#include 108 | +#include 109 | +#include 110 | +#include 111 | +#include 112 | +#include 113 | +#include 114 | +#include 115 | +#include 116 | +#include 117 | + 118 | +#include "ext4_jbd2.h" 119 | +#include "xattr.h" 120 | +#include "acl.h" 121 | + 122 | +#include 123 | + 124 | +int ext4_readpage(struct file *file, struct page *page) 125 | +{ 126 | + int ret = -EAGAIN; 127 | + struct inode *inode = page->mapping->host; 128 | + 129 | + trace_ext4_readpage(page); 130 | + 131 | + if (ext4_has_inline_data(inode)) 132 | + ret = ext4_readpage_inline(inode, page); 133 | + 134 | + if (ret == -EAGAIN) 135 | + return mpage_readpage(page, ext4_get_block); 136 | + 137 | + return ret; 138 | +} 139 | + 140 | +int ext4_readpages(struct file *file, struct address_space *mapping, 141 | + struct list_head *pages, unsigned nr_pages) 142 | +{ 143 | + struct inode *inode = mapping->host; 144 | + 145 | + /* If the file has inline data, no need to do readpages. 
*/ 146 | + if (ext4_has_inline_data(inode)) 147 | + return 0; 148 | + 149 | + return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); 150 | +} 151 | + 152 | -------------------------------------------------------------------------------- /old-patches/only-call-ext4_truncate-if-there-is-data-to-truncate: -------------------------------------------------------------------------------- 1 | ext4: in ext4_setattr(), only call ext4_truncate() if there is data to drop 2 | 3 | If there are no blocks associated with the inode (and no inline data), 4 | there's no point calling ext4_truncate(). This avoids setting the 5 | replace-via-truncate heuristic if there is an attempt to truncate a 6 | file which is already zero-length --- which is something that happens 7 | in the core dumping code, in case there is an already existing core 8 | file. In the common case, there is not a previous core file, so by not 9 | enabling the replace-via-truncate heuristic, we can speed up core 10 | dumps. 11 | 12 | Reported-by: Omar Sandoval 13 | Signed-off-by: Theodore Ts'o 14 | --- 15 | fs/ext4/inode.c | 2 +- 16 | 1 file changed, 1 insertion(+), 1 deletion(-) 17 | 18 | diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c 19 | index 44ee5d9..cd757f8 100644 20 | --- a/fs/ext4/inode.c 21 | +++ b/fs/ext4/inode.c 22 | @@ -5171,7 +5171,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) 23 | * in data=journal mode to make pages freeable. 
24 | */ 25 | truncate_pagecache(inode, inode->i_size); 26 | - if (shrink) 27 | + if (shrink && (inode->i_blocks || ext4_has_inline_data(inode))) 28 | ext4_truncate(inode); 29 | up_write(&EXT4_I(inode)->i_mmap_sem); 30 | } 31 | -------------------------------------------------------------------------------- /old-patches/series: -------------------------------------------------------------------------------- 1 | only-call-ext4_truncate-if-there-is-data-to-truncate 2 | 3 | #crypto-rename-ext4_get_encryption_info 4 | 5 | crypto-add-ciphertext_access-mount-option 6 | crypto-add-ioctls-to-backup-crypto-metadata 7 | 8 | add-encryption-debug-files 9 | 10 | # not yet ready 11 | #dont-use-io-end-if-not-needed 12 | 13 | # not yet ready; patch series so ext4 has has full responsibility 14 | # for ext4_readpage[s] and does not use mpage. 15 | # 16 | #move-read-page-functions-to-new-file 17 | #include-mpage-functions-into-readpage.c 18 | #inline-ext4_get_block-into-readpage 19 | 20 | add-fallocate-mode-blocking-for-debugging 21 | 22 | # use-discard-if-possible-in-blkdev_issue_zeroout 23 | add-blkdiscard-ioctl 24 | 25 | block-dio-during-truncate 26 | 27 | delalloc-debug 28 | 29 | # note: this may make things slower... 30 | commit-as-soon-as-possible-after-log_start_commit 31 | 32 | # Ted's squelch series, still needs work 33 | add-sysfs-bool-support 34 | add-squelch-errors-support 35 | 36 | # Various disabled patches... 
37 | # 38 | #auto-enable-journal_async_commit 39 | #mballoc-allocate-larger-extents 40 | 41 | # various debugging/benchmarking assists 42 | dump-in-use-buffers 43 | akpm-jbd2-locking-fix 44 | 45 | -------------------------------------------------------------------------------- /old-patches/use-discard-if-possible-in-blkdev_issue_zeroout: -------------------------------------------------------------------------------- 1 | block: use discard if possible in blkdev_issue_zeroout() 2 | 3 | If the block device supports discards and guarantees that subsequent 4 | reads will return zeros (sometimes known as DZAT, for Deterministic 5 | read Zeros After Trim), use this to implement blkdev_issue_zeroout() 6 | 7 | Signed-off-by: "Theodore Ts'o" 8 | --- 9 | block/blk-lib.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------- 10 | 1 file changed, 62 insertions(+), 10 deletions(-) 11 | 12 | diff --git a/block/blk-lib.c b/block/blk-lib.c 13 | index 2da76c9..62cbf28 100644 14 | --- a/block/blk-lib.c 15 | +++ b/block/blk-lib.c 16 | @@ -269,6 +269,32 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, 17 | return ret; 18 | } 19 | 20 | +static int issue_zeroout_or_write_same(struct block_device *bdev, 21 | + sector_t sector, 22 | + sector_t nr_sects, gfp_t gfp_mask) 23 | +{ 24 | + if (bdev_write_same(bdev)) { 25 | + unsigned char bdn[BDEVNAME_SIZE]; 26 | + 27 | + if (!blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask, 28 | + ZERO_PAGE(0))) 29 | + return 0; 30 | + 31 | + bdevname(bdev, bdn); 32 | + pr_err("%s: WRITE SAME failed. Manually zeroing.\n", bdn); 33 | + } 34 | + 35 | + return __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask); 36 | +} 37 | + 38 | +/* 39 | + * Like sector_div except don't modify s. 
40 | + */ 41 | +static unsigned int sector_mod(sector_t s, unsigned int m) 42 | +{ 43 | + return sector_div(s, m); 44 | +} 45 | + 46 | /** 47 | * blkdev_issue_zeroout - zero-fill a block range 48 | * @bdev: blockdev to write 49 | @@ -277,23 +303,49 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, 50 | * @gfp_mask: memory allocation flags (for bio_alloc) 51 | * 52 | * Description: 53 | - * Generate and issue number of bios with zerofiled pages. 54 | + * Issues bios which zeros the requested block range. 55 | */ 56 | - 57 | int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, 58 | sector_t nr_sects, gfp_t gfp_mask) 59 | { 60 | - if (bdev_write_same(bdev)) { 61 | - unsigned char bdn[BDEVNAME_SIZE]; 62 | + struct request_queue *q = bdev_get_queue(bdev); 63 | + unsigned int alignment, granularity; 64 | + unsigned int c; 65 | + int ret; 66 | 67 | - if (!blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask, 68 | - ZERO_PAGE(0))) 69 | - return 0; 70 | + if (!q) 71 | + return -ENXIO; 72 | 73 | - bdevname(bdev, bdn); 74 | - pr_err("%s: WRITE SAME failed. 
Manually zeroing.\n", bdn); 75 | + if (!blk_queue_discard(q) || !queue_discard_zeroes_data(q) || 76 | + q->limits.discard_misaligned) 77 | + return issue_zeroout_or_write_same(bdev, sector, 78 | + 79 | + nr_sects, gfp_mask); 80 | + 81 | + alignment = q->limits.discard_alignment >> 9; 82 | + granularity = q->limits.discard_granularity >> 9; 83 | + 84 | + c = sector_mod(granularity + alignment - sector, granularity); 85 | + if (c > nr_sects) 86 | + c = nr_sects; 87 | + 88 | + if (c) { 89 | + int ret = issue_zeroout_or_write_same(bdev, sector, 90 | + c, gfp_mask); 91 | + if (ret) 92 | + return ret; 93 | + nr_sects -= c; 94 | } 95 | + if (nr_sects == 0) 96 | + return 0; 97 | 98 | - return __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask); 99 | + c = sector_mod(nr_sects, granularity); 100 | + 101 | + ret = blkdev_issue_discard(bdev, sector, nr_sects - c, gfp_mask, 0); 102 | + if (ret || c == 0) 103 | + return ret; 104 | + 105 | + return issue_zeroout_or_write_same(bdev, sector + nr_sects - c, c, 106 | + gfp_mask); 107 | } 108 | EXPORT_SYMBOL(blkdev_issue_zeroout); 109 | -------------------------------------------------------------------------------- /series: -------------------------------------------------------------------------------- 1 | # v5.2-rc2 2 | 3 | #################################################### 4 | # unstable patches 5 | #################################################### 6 | 7 | stable-boundary 8 | stable-boundary-undo.patch 9 | 10 | # Lazy journalling patches 11 | jbd2-dont-double-bump-transaction-number 12 | journal-superblock-changes 13 | add-journal-no-cleanup-option 14 | add-support-for-log-metadata-block-tracking-in-log 15 | add-indirection-to-metadata-block-read-paths 16 | cleaner 17 | load-jmap-from-journal 18 | disable-writeback 19 | add-ext4-journal-lazy-mount-option 20 | #end lazy journal patches 21 | 22 | -------------------------------------------------------------------------------- /stable-boundary: 
-------------------------------------------------------------------------------- 1 | ext4: Stable/Unstable boundary 2 | 3 | From: Theodore Ts'o <tytso@mit.edu> 4 | 5 | This is the boundary between the stable and unstable patches in the 6 | ext4 patch queue. 7 | 8 | Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> 9 | --- 10 | 11 | fs/ext4/extents.c | 1 + 12 | 1 file changed, 1 insertion(+) 13 | 14 | 15 | Index: linux-2.6.26-rc9/fs/ext4/extents.c 16 | =================================================================== 17 | --- linux-2.6.26-rc9.orig/fs/ext4/extents.c 2008-07-11 16:05:13.000000000 -0700 18 | +++ linux-2.6.26-rc9/fs/ext4/extents.c 2008-07-11 16:05:17.000000000 -0700 19 | @@ -27,6 +27,7 @@ 20 | * - ext4*_error() should be used in some situations 21 | * - analyze all BUG()/BUG_ON(), use -EIO where appropriate 22 | * - smart tree reduction 23 | + * stable boundary change 24 | */ 25 | 26 | #include <linux/module.h> 27 | -------------------------------------------------------------------------------- /stable-boundary-undo.patch: -------------------------------------------------------------------------------- 1 | ext4: undo the stable boundary patch changes 2 | 3 | From: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> 4 | 5 | This helps in applying the series with different type of 6 | tools that expect a code diff to apply any patch.
7 | 8 | Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> 9 | Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> 10 | --- 11 | 12 | fs/ext4/extents.c | 1 - 13 | 1 file changed, 1 deletion(-) 14 | 15 | 16 | Index: linux-2.6.26-rc9/fs/ext4/extents.c 17 | =================================================================== 18 | --- linux-2.6.26-rc9.orig/fs/ext4/extents.c 2008-07-11 16:05:17.000000000 -0700 19 | +++ linux-2.6.26-rc9/fs/ext4/extents.c 2008-07-11 16:05:17.000000000 -0700 20 | @@ -27,7 +27,6 @@ 21 | * - ext4*_error() should be used in some situations 22 | * - analyze all BUG()/BUG_ON(), use -EIO where appropriate 23 | * - smart tree reduction 24 | - * stable boundary change 25 | */ 26 | 27 | #include <linux/module.h> 28 | -------------------------------------------------------------------------------- /timestamps: -------------------------------------------------------------------------------- 1 | touch -d @1421646888 archive 2 | touch -d @1493511621 old-patches 3 | touch -d @1543184491 stable-boundary-undo.patch 4 | touch -d @1543184551 jbd2-dont-double-bump-transaction-number 5 | touch -d @1543184611 journal-superblock-changes 6 | touch -d @1543184671 add-journal-no-cleanup-option 7 | touch -d @1543184731 add-support-for-log-metadata-block-tracking-in-log 8 | touch -d @1543184791 add-indirection-to-metadata-block-read-paths 9 | touch -d @1543184851 cleaner 10 | touch -d @1543184911 load-jmap-from-journal 11 | touch -d @1543184971 disable-writeback 12 | touch -d @1543185031 add-ext4-journal-lazy-mount-option 13 | touch -d @1558669179 stable-boundary 14 | touch -d @1558930704 status 15 | touch -d @1558930766 save-patch 16 | touch -d @1558930859 series 17 | touch -d @1558930873 timestamps 18 | --------------------------------------------------------------------------------