diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2025-12-03 20:37:15 -0800 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2025-12-03 20:37:15 -0800 |
| commit | fbeea4db51a6eaf62b4784f718844726dd2199b9 (patch) | |
| tree | f869d319cb4b2036afdf472572b64b1bab9aa836 /fs/ext4 | |
| parent | afcbce74f358a540761aa893939590a667162dff (diff) | |
| parent | 91ef18b567dae84c0cea9b996d933c856e366f52 (diff) | |
| download | linux-fbeea4db51a6eaf62b4784f718844726dd2199b9.tar.gz linux-fbeea4db51a6eaf62b4784f718844726dd2199b9.tar.bz2 linux-fbeea4db51a6eaf62b4784f718844726dd2199b9.zip | |
Merge tag 'ext4_for_linus-6.19-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
Pull ext4 updates from Ted Ts'o:
"New features and improvements for the ext4 file system:
- Optimize online defragmentation by using folios instead of
individual buffer heads
- Improve error codes stored in the superblock when the journal
aborts
- Minor cleanups and clarifications in ext4_map_blocks()
- Add documentation of the casefold and encrypt flags
- Add support for file systems with a blocksize greater than the
pagesize
- Improve performance by caching the fact that an inode
does not have a POSIX ACL
Various Bug Fixes:
- Fix false positive complaints from smatch
- Fix error code which is returned by ext4fs_dirhash() when Siphash
is used without the encryption key
- Fix races when writing to inline data files which could trigger a
BUG
- Fix potential NULL dereference when there is a corrupt file system
with an extended attribute value stored in an inode
- Fix false positive lockdep report when syzbot uses ext4 and ocfs2
together
- Fix false positive reported by DEPT by adjusting lock annotation
- Avoid a potential BUG_ON in jbd2 when a file system is massively
corrupted
- Fix a WARN_ON when superblock is corrupted with a non-NUL-terminated
mount options field
- Add a check for userspace passing a non-NUL-terminated mount
options field to EXT4_IOC_SET_TUNE_SB_PARAM
- Fix a potential journal checksum failure when a file system is
copied while it is mounted read-only
- Fix a potential orphan file tracking error which only
showed up on 32-bit systems
- Fix assertion checks in mballoc (which have to be explicitly enabled
by manually enabling AGGRESSIVE_CHECKS and recompiling)
- Avoid complaining about overly large orphan files created by mke2fs
with file systems with a 64k block size"
* tag 'ext4_for_linus-6.19-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (58 commits)
ext4: mark inodes without acls in __ext4_iget()
ext4: enable block size larger than page size
ext4: add checks for large folio incompatibilities when BS > PS
ext4: support verifying data from large folios with fs-verity
ext4: make data=journal support large block size
ext4: support large block size in __ext4_block_zero_page_range()
ext4: support large block size in mpage_prepare_extent_to_map()
ext4: support large block size in mpage_map_and_submit_buffers()
ext4: support large block size in ext4_block_write_begin()
ext4: support large block size in ext4_mpage_readpages()
ext4: rename 'page' references to 'folio' in multi-block allocator
ext4: prepare buddy cache inode for BS > PS with large folios
ext4: support large block size in ext4_mb_init_cache()
ext4: support large block size in ext4_mb_get_buddy_page_lock()
ext4: support large block size in ext4_mb_load_buddy_gfp()
ext4: add EXT4_LBLK_TO_PG and EXT4_PG_TO_LBLK for block/page conversion
ext4: add EXT4_LBLK_TO_B macro for logical block to bytes conversion
ext4: support large block size in ext4_readdir()
ext4: support large block size in ext4_calculate_overhead()
ext4: introduce s_min_folio_order for future BS > PS support
...
Diffstat (limited to 'fs/ext4')
| -rw-r--r-- | fs/ext4/balloc.c | 2 | ||||
| -rw-r--r-- | fs/ext4/dir.c | 8 | ||||
| -rw-r--r-- | fs/ext4/ext4.h | 50 | ||||
| -rw-r--r-- | fs/ext4/ext4_jbd2.c | 3 | ||||
| -rw-r--r-- | fs/ext4/extents.c | 28 | ||||
| -rw-r--r-- | fs/ext4/extents_status.c | 31 | ||||
| -rw-r--r-- | fs/ext4/extents_status.h | 2 | ||||
| -rw-r--r-- | fs/ext4/hash.c | 2 | ||||
| -rw-r--r-- | fs/ext4/ialloc.c | 1 | ||||
| -rw-r--r-- | fs/ext4/inline.c | 14 | ||||
| -rw-r--r-- | fs/ext4/inode.c | 165 | ||||
| -rw-r--r-- | fs/ext4/ioctl.c | 14 | ||||
| -rw-r--r-- | fs/ext4/mballoc.c | 188 | ||||
| -rw-r--r-- | fs/ext4/move_extent.c | 786 | ||||
| -rw-r--r-- | fs/ext4/namei.c | 18 | ||||
| -rw-r--r-- | fs/ext4/orphan.c | 4 | ||||
| -rw-r--r-- | fs/ext4/readpage.c | 7 | ||||
| -rw-r--r-- | fs/ext4/super.c | 72 | ||||
| -rw-r--r-- | fs/ext4/sysfs.c | 6 | ||||
| -rw-r--r-- | fs/ext4/verity.c | 2 | ||||
| -rw-r--r-- | fs/ext4/xattr.c | 6 |
21 files changed, 725 insertions, 684 deletions
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index c9329ed5c094..8040c731b3e4 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -752,7 +752,7 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, *count = ar.len; /* * Account for the allocated meta blocks. We will never - * fail EDQUOT for metdata, but we do account for it. + * fail EDQUOT for metadata, but we do account for it. */ if (!(*errp) && (flags & EXT4_MB_DELALLOC_RESERVED)) { dquot_alloc_block_nofail(inode, diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index d4164c507a90..256fe2c1d4c1 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -192,13 +192,13 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) continue; } if (err > 0) { - pgoff_t index = map.m_pblk >> - (PAGE_SHIFT - inode->i_blkbits); + pgoff_t index = map.m_pblk << inode->i_blkbits >> + PAGE_SHIFT; if (!ra_has_index(&file->f_ra, index)) page_cache_sync_readahead( sb->s_bdev->bd_mapping, - &file->f_ra, file, - index, 1); + &file->f_ra, file, index, + 1 << EXT4_SB(sb)->s_min_folio_order); file->f_ra.prev_pos = (loff_t)index << PAGE_SHIFT; bh = ext4_bread(NULL, inode, map.m_lblk, 0); if (IS_ERR(bh)) { diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 57087da6c7be..56112f201cac 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -260,6 +260,7 @@ struct ext4_map_blocks { ext4_lblk_t m_lblk; unsigned int m_len; unsigned int m_flags; + u64 m_seq; }; /* @@ -367,7 +368,14 @@ struct ext4_io_submit { blkbits)) #define EXT4_B_TO_LBLK(inode, offset) \ (round_up((offset), i_blocksize(inode)) >> (inode)->i_blkbits) - +#define EXT4_LBLK_TO_B(inode, lblk) ((loff_t)(lblk) << (inode)->i_blkbits) + +/* Translate a block number to a page index */ +#define EXT4_LBLK_TO_PG(inode, lblk) (EXT4_LBLK_TO_B((inode), (lblk)) >> \ + PAGE_SHIFT) +/* Translate a page index to a block number */ +#define EXT4_PG_TO_LBLK(inode, pnum) (((loff_t)(pnum) << PAGE_SHIFT) >> \ + (inode)->i_blkbits) /* Translate a block number to a cluster 
number */ #define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits) /* Translate a cluster number to a block number */ @@ -694,13 +702,22 @@ enum { /* Caller is from the delayed allocation writeout path * finally doing the actual allocation of delayed blocks */ #define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004 - /* caller is from the direct IO path, request to creation of an - unwritten extents if not allocated, split the unwritten - extent if blocks has been preallocated already*/ -#define EXT4_GET_BLOCKS_PRE_IO 0x0008 -#define EXT4_GET_BLOCKS_CONVERT 0x0010 -#define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_PRE_IO|\ + /* + * This means that we cannot merge newly allocated extents, and if we + * found an unwritten extent, we need to split it. + */ +#define EXT4_GET_BLOCKS_SPLIT_NOMERGE 0x0008 + /* + * Caller is from the dio or dioread_nolock buffered IO, reqest to + * create an unwritten extent if it does not exist or split the + * found unwritten extent. Also do not merge the newly created + * unwritten extent, io end will convert unwritten to written, + * and try to merge the written extent. + */ +#define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_SPLIT_NOMERGE|\ EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT) + /* Convert unwritten extent to initialized. */ +#define EXT4_GET_BLOCKS_CONVERT 0x0010 /* Eventual metadata allocation (due to growing extent tree) * should not fail, so try to use reserved blocks for that.*/ #define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020 @@ -1138,6 +1155,8 @@ struct ext4_inode_info { ext4_lblk_t i_es_shrink_lblk; /* Offset where we start searching for extents to shrink. Protected by i_es_lock */ + u64 i_es_seq; /* Change counter for extents. + Protected by i_es_lock */ /* ialloc */ ext4_group_t i_last_alloc_group; @@ -1685,6 +1704,11 @@ struct ext4_sb_info { /* record the last minlen when FITRIM is called. 
*/ unsigned long s_last_trim_minblks; + /* minimum folio order of a page cache allocation */ + u16 s_min_folio_order; + /* supported maximum folio order, 0 means not supported */ + u16 s_max_folio_order; + /* Precomputed FS UUID checksum for seeding other checksums */ __u32 s_csum_seed; @@ -2472,28 +2496,19 @@ static inline unsigned int ext4_dir_rec_len(__u8 name_len, return (rec_len & ~EXT4_DIR_ROUND); } -/* - * If we ever get support for fs block sizes > page_size, we'll need - * to remove the #if statements in the next two functions... - */ static inline unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize) { unsigned len = le16_to_cpu(dlen); -#if (PAGE_SIZE >= 65536) if (len == EXT4_MAX_REC_LEN || len == 0) return blocksize; return (len & 65532) | ((len & 3) << 16); -#else - return len; -#endif } static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) { BUG_ON((len > blocksize) || (blocksize > (1 << 18)) || (len & 3)); -#if (PAGE_SIZE >= 65536) if (len < 65536) return cpu_to_le16(len); if (len == blocksize) { @@ -2503,9 +2518,6 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) return cpu_to_le16(0); } return cpu_to_le16((len & 65532) | ((len >> 16) & 3)); -#else - return cpu_to_le16(len); -#endif } /* diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index a0e66bc10093..05e5946ed9b3 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -16,8 +16,7 @@ int ext4_inode_journal_mode(struct inode *inode) ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) || test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && - !test_opt(inode->i_sb, DELALLOC) && - !mapping_large_folio_support(inode->i_mapping))) { + !test_opt(inode->i_sb, DELALLOC))) { /* We do not support data journalling for encrypted data */ if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ diff --git a/fs/ext4/extents.c 
b/fs/ext4/extents.c index ca5499e9412b..2cf5759ba689 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -333,7 +333,7 @@ ext4_force_split_extent_at(handle_t *handle, struct inode *inode, int nofail) { int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext); - int flags = EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO; + int flags = EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_SPLIT_NOMERGE; if (nofail) flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL | EXT4_EX_NOFAIL; @@ -2002,7 +2002,7 @@ ext4_ext_insert_extent(handle_t *handle, struct inode *inode, } /* try to insert block into found extent and return */ - if (ex && !(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) { + if (ex && !(gb_flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE)) { /* * Try to see whether we should rather test the extent on @@ -2181,7 +2181,7 @@ has_space: merge: /* try to merge extents */ - if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) + if (!(gb_flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE)) ext4_ext_try_to_merge(handle, inode, path, nearex); /* time to correct all indexes above */ @@ -2213,7 +2213,7 @@ static int ext4_fill_es_cache_info(struct inode *inode, while (block <= end) { next = 0; flags = 0; - if (!ext4_es_lookup_extent(inode, block, &next, &es)) + if (!ext4_es_lookup_extent(inode, block, &next, &es, NULL)) break; if (ext4_es_is_unwritten(&es)) flags |= FIEMAP_EXTENT_UNWRITTEN; @@ -3224,7 +3224,7 @@ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle, else ext4_ext_mark_initialized(ex); - if (!(flags & EXT4_GET_BLOCKS_PRE_IO)) + if (!(flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE)) ext4_ext_try_to_merge(handle, inode, path, ex); err = ext4_ext_dirty(handle, inode, path + path->p_depth); @@ -3368,7 +3368,7 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle, if (map->m_lblk + map->m_len < ee_block + ee_len) { split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT; - flags1 = flags | EXT4_GET_BLOCKS_PRE_IO; + flags1 = flags | EXT4_GET_BLOCKS_SPLIT_NOMERGE; if (unwritten) split_flag1 |= EXT4_EXT_MARK_UNWRIT1 | 
EXT4_EXT_MARK_UNWRIT2; @@ -3721,10 +3721,6 @@ static struct ext4_ext_path *ext4_split_convert_extents(handle_t *handle, >> inode->i_sb->s_blocksize_bits; if (eof_block < map->m_lblk + map->m_len) eof_block = map->m_lblk + map->m_len; - /* - * It is safe to convert extent to initialized via explicit - * zeroout only if extent is fully inside i_size or new_size. - */ depth = ext_depth(inode); ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); @@ -3735,11 +3731,15 @@ static struct ext4_ext_path *ext4_split_convert_extents(handle_t *handle, split_flag |= EXT4_EXT_DATA_VALID1; /* Convert to initialized */ } else if (flags & EXT4_GET_BLOCKS_CONVERT) { + /* + * It is safe to convert extent to initialized via explicit + * zeroout only if extent is fully inside i_size or new_size. + */ split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2); } - flags |= EXT4_GET_BLOCKS_PRE_IO; + flags |= EXT4_GET_BLOCKS_SPLIT_NOMERGE; return ext4_split_extent(handle, inode, path, map, split_flag, flags, allocated); } @@ -3911,7 +3911,7 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, *allocated, newblock); /* get_block() before submitting IO, split the extent */ - if (flags & EXT4_GET_BLOCKS_PRE_IO) { + if (flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE) { path = ext4_split_convert_extents(handle, inode, map, path, flags | EXT4_GET_BLOCKS_CONVERT, allocated); if (IS_ERR(path)) @@ -4562,7 +4562,7 @@ retry: * allow a full retry cycle for any remaining allocations */ retries = 0; - epos = (loff_t)(map.m_lblk + ret) << blkbits; + epos = EXT4_LBLK_TO_B(inode, map.m_lblk + ret); inode_set_ctime_current(inode); if (new_size) { if (epos > new_size) @@ -5618,7 +5618,7 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) path = ext4_split_extent_at(handle, inode, path, start_lblk, split_flag, EXT4_EX_NOCACHE | - EXT4_GET_BLOCKS_PRE_IO | + EXT4_GET_BLOCKS_SPLIT_NOMERGE | 
EXT4_GET_BLOCKS_METADATA_NOFAIL); } diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 31dc0496f8d0..e04fbf10fe4f 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -235,6 +235,13 @@ static inline ext4_lblk_t ext4_es_end(struct extent_status *es) return es->es_lblk + es->es_len - 1; } +static inline void ext4_es_inc_seq(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + + WRITE_ONCE(ei->i_es_seq, ei->i_es_seq + 1); +} + /* * search through the tree for an delayed extent with a given offset. If * it can't be found, try to find next extent. @@ -906,7 +913,6 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, newes.es_lblk = lblk; newes.es_len = len; ext4_es_store_pblock_status(&newes, pblk, status); - trace_ext4_es_insert_extent(inode, &newes); ext4_es_insert_extent_check(inode, &newes); @@ -955,6 +961,11 @@ retry: } pending = err3; } + /* + * TODO: For cache on-disk extents, there is no need to increment + * the sequence counter, this requires future optimization. 
+ */ + ext4_es_inc_seq(inode); error: write_unlock(&EXT4_I(inode)->i_es_lock); /* @@ -981,6 +992,7 @@ error: if (err1 || err2 || err3 < 0) goto retry; + trace_ext4_es_insert_extent(inode, &newes); ext4_es_print_tree(inode); return; } @@ -1027,8 +1039,8 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, * Return: 1 on found, 0 on not */ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, - ext4_lblk_t *next_lblk, - struct extent_status *es) + ext4_lblk_t *next_lblk, struct extent_status *es, + u64 *pseq) { struct ext4_es_tree *tree; struct ext4_es_stats *stats; @@ -1087,6 +1099,8 @@ out: } else *next_lblk = 0; } + if (pseq) + *pseq = EXT4_I(inode)->i_es_seq; } else { percpu_counter_inc(&stats->es_stats_cache_misses); } @@ -1550,7 +1564,6 @@ void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return; - trace_ext4_es_remove_extent(inode, lblk, len); es_debug("remove [%u/%u) from extent status tree of inode %lu\n", lblk, len, inode->i_ino); @@ -1570,16 +1583,21 @@ retry: */ write_lock(&EXT4_I(inode)->i_es_lock); err = __es_remove_extent(inode, lblk, end, &reserved, es); + if (err) + goto error; /* Free preallocated extent if it didn't get used. 
*/ if (es) { if (!es->es_len) __es_free_extent(es); es = NULL; } + ext4_es_inc_seq(inode); +error: write_unlock(&EXT4_I(inode)->i_es_lock); if (err) goto retry; + trace_ext4_es_remove_extent(inode, lblk, len); ext4_es_print_tree(inode); ext4_da_release_space(inode, reserved); } @@ -2140,8 +2158,6 @@ void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk, newes.es_lblk = lblk; newes.es_len = len; ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED); - trace_ext4_es_insert_delayed_extent(inode, &newes, lclu_allocated, - end_allocated); ext4_es_insert_extent_check(inode, &newes); @@ -2196,11 +2212,14 @@ retry: pr2 = NULL; } } + ext4_es_inc_seq(inode); error: write_unlock(&EXT4_I(inode)->i_es_lock); if (err1 || err2 || err3 < 0) goto retry; + trace_ext4_es_insert_delayed_extent(inode, &newes, lclu_allocated, + end_allocated); ext4_es_print_tree(inode); ext4_print_pending_tree(inode); return; diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index 8f9c008d11e8..f3396cf32b44 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -148,7 +148,7 @@ extern void ext4_es_find_extent_range(struct inode *inode, struct extent_status *es); extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t *next_lblk, - struct extent_status *es); + struct extent_status *es, u64 *pseq); extern bool ext4_es_scan_range(struct inode *inode, int (*matching_fn)(struct extent_status *es), ext4_lblk_t lblk, ext4_lblk_t end); diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index 33cd5b6b02d5..48483cd015d3 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -268,7 +268,7 @@ static int __ext4fs_dirhash(const struct inode *dir, const char *name, int len, combined_hash = fscrypt_fname_siphash(dir, &qname); } else { ext4_warning_inode(dir, "Siphash requires key"); - return -1; + return -EINVAL; } hash = (__u32)(combined_hash >> 32); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index ba4fd9aba1c1..b20a1bf866ab 100644 
--- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1293,7 +1293,6 @@ got: ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen)); } - ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ ext4_set_inode_state(inode, EXT4_STATE_NEW); ei->i_extra_isize = sbi->s_want_extra_isize; diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 1b094a4f3866..1f6bc05593df 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -418,7 +418,12 @@ static int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, return -ENOSPC; ext4_write_lock_xattr(inode, &no_expand); - + /* + * ei->i_inline_size may have changed since the initial check + * if other xattrs were added. Recalculate to ensure + * ext4_update_inline_data() validates against current capacity. + */ + (void) ext4_find_inline_data_nolock(inode); if (ei->i_inline_off) ret = ext4_update_inline_data(handle, inode, len); else @@ -446,9 +451,13 @@ static int ext4_destroy_inline_data_nolock(handle_t *handle, if (!ei->i_inline_off) return 0; + down_write(&ei->i_data_sem); + error = ext4_get_inode_loc(inode, &is.iloc); - if (error) + if (error) { + up_write(&ei->i_data_sem); return error; + } error = ext4_xattr_ibody_find(inode, &i, &is); if (error) @@ -487,6 +496,7 @@ out: brelse(is.iloc.bh); if (error == -ENODATA) error = 0; + up_write(&ei->i_data_sem); return error; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 78ea864fa8cd..0c466ccbed69 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -549,10 +549,13 @@ static int ext4_map_query_blocks(handle_t *handle, struct inode *inode, retval = ext4_ext_map_blocks(handle, inode, map, flags); else retval = ext4_ind_map_blocks(handle, inode, map, flags); - - if (retval <= 0) + if (retval < 0) return retval; + /* A hole? 
*/ + if (retval == 0) + goto out; + if (unlikely(retval != map->m_len)) { ext4_warning(inode->i_sb, "ES len assertion failed for inode " @@ -572,11 +575,13 @@ static int ext4_map_query_blocks(handle_t *handle, struct inode *inode, EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, status, false); - return retval; + } else { + retval = ext4_map_query_blocks_next_in_leaf(handle, inode, map, + orig_mlen); } - - return ext4_map_query_blocks_next_in_leaf(handle, inode, map, - orig_mlen); +out: + map->m_seq = READ_ONCE(EXT4_I(inode)->i_es_seq); + return retval; } static int ext4_map_create_blocks(handle_t *handle, struct inode *inode, @@ -647,8 +652,8 @@ static int ext4_map_create_blocks(handle_t *handle, struct inode *inode, * If the extent has been zeroed out, we don't need to update * extent status tree. */ - if (flags & EXT4_GET_BLOCKS_PRE_IO && - ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { + if (flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE && + ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, &map->m_seq)) { if (ext4_es_is_written(&es)) return retval; } @@ -657,6 +662,7 @@ static int ext4_map_create_blocks(handle_t *handle, struct inode *inode, EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, status, flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE); + map->m_seq = READ_ONCE(EXT4_I(inode)->i_es_seq); return retval; } @@ -722,7 +728,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, ext4_check_map_extents_env(inode); /* Lookup extent status tree firstly */ - if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { + if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, &map->m_seq)) { if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk; @@ -809,7 +815,13 @@ found: down_write(&EXT4_I(inode)->i_data_sem); retval = ext4_map_create_blocks(handle, inode, 
map, flags); up_write((&EXT4_I(inode)->i_data_sem)); - if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { + + if (retval < 0) + ext_debug(inode, "failed with err %d\n", retval); + if (retval <= 0) + return retval; + + if (map->m_flags & EXT4_MAP_MAPPED) { ret = check_block_validity(inode, map); if (ret != 0) return ret; @@ -824,9 +836,8 @@ found: !(flags & EXT4_GET_BLOCKS_ZERO) && !ext4_is_quota_file(inode) && ext4_should_order_data(inode)) { - loff_t start_byte = - (loff_t)map->m_lblk << inode->i_blkbits; - loff_t length = (loff_t)map->m_len << inode->i_blkbits; + loff_t start_byte = EXT4_LBLK_TO_B(inode, map->m_lblk); + loff_t length = EXT4_LBLK_TO_B(inode, map->m_len); if (flags & EXT4_GET_BLOCKS_IO_SUBMIT) ret = ext4_jbd2_inode_add_wait(handle, inode, @@ -838,12 +849,8 @@ found: return ret; } } - if (retval > 0 && (map->m_flags & EXT4_MAP_UNWRITTEN || - map->m_flags & EXT4_MAP_MAPPED)) - ext4_fc_track_range(handle, inode, map->m_lblk, - map->m_lblk + map->m_len - 1); - if (retval < 0) - ext_debug(inode, "failed with err %d\n", retval); + ext4_fc_track_range(handle, inode, map->m_lblk, map->m_lblk + + map->m_len - 1); return retval; } @@ -1162,8 +1169,7 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio, unsigned block_start, block_end; sector_t block; int err = 0; - unsigned blocksize = inode->i_sb->s_blocksize; - unsigned bbits; + unsigned int blocksize = i_blocksize(inode); struct buffer_head *bh, *head, *wait[2]; int nr_wait = 0; int i; @@ -1172,12 +1178,12 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio, BUG_ON(!folio_test_locked(folio)); BUG_ON(to > folio_size(folio)); BUG_ON(from > to); + WARN_ON_ONCE(blocksize > folio_size(folio)); head = folio_buffers(folio); if (!head) head = create_empty_buffers(folio, blocksize, 0); - bbits = ilog2(blocksize); - block = (sector_t)folio->index << (PAGE_SHIFT - bbits); + block = EXT4_PG_TO_LBLK(inode, folio->index); for (bh = head, block_start = 0; bh != head || !block_start; 
block++, block_start = block_end, bh = bh->b_this_page) { @@ -1907,7 +1913,7 @@ static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map) ext4_check_map_extents_env(inode); /* Lookup extent status tree firstly */ - if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { + if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, NULL)) { map->m_len = min_t(unsigned int, map->m_len, es.es_len - (map->m_lblk - es.es_lblk)); @@ -1960,7 +1966,7 @@ add_delayed: * is held in write mode, before inserting a new da entry in * the extent status tree. */ - if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { + if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, NULL)) { map->m_len = min_t(unsigned int, map->m_len, es.es_len - (map->m_lblk - es.es_lblk)); @@ -1978,6 +1984,8 @@ add_delayed: map->m_flags |= EXT4_MAP_DELAYED; retval = ext4_insert_delayed_blocks(inode, map->m_lblk, map->m_len); + if (!retval) + map->m_seq = READ_ONCE(EXT4_I(inode)->i_es_seq); up_write(&EXT4_I(inode)->i_data_sem); return retval; @@ -2224,7 +2232,6 @@ static int mpage_process_folio(struct mpage_da_data *mpd, struct folio *folio, ext4_lblk_t lblk = *m_lblk; ext4_fsblk_t pblock = *m_pblk; int err = 0; - int blkbits = mpd->inode->i_blkbits; ssize_t io_end_size = 0; struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end); @@ -2250,7 +2257,8 @@ static int mpage_process_folio(struct mpage_da_data *mpd, struct folio *folio, err = PTR_ERR(io_end_vec); goto out; } - io_end_vec->offset = (loff_t)mpd->map.m_lblk << blkbits; + io_end_vec->offset = EXT4_LBLK_TO_B(mpd->inode, + mpd->map.m_lblk); } *map_bh = true; goto out; @@ -2260,7 +2268,7 @@ static int mpage_process_folio(struct mpage_da_data *mpd, struct folio *folio, bh->b_blocknr = pblock++; } clear_buffer_unwritten(bh); - io_end_size += (1 << blkbits); + io_end_size += i_blocksize(mpd->inode); } while (lblk++, (bh = bh->b_this_page) != head); io_end_vec->size += io_end_size; @@ -2290,15 +2298,14 @@ static int 
mpage_map_and_submit_buffers(struct mpage_da_data *mpd) struct folio_batch fbatch; unsigned nr, i; struct inode *inode = mpd->inode; - int bpp_bits = PAGE_SHIFT - inode->i_blkbits; pgoff_t start, end; ext4_lblk_t lblk; ext4_fsblk_t pblock; int err; bool map_bh = false; - start = mpd->map.m_lblk >> bpp_bits; - end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits; + start = EXT4_LBLK_TO_PG(inode, mpd->map.m_lblk); + end = EXT4_LBLK_TO_PG(inode, mpd->map.m_lblk + mpd->map.m_len - 1); pblock = mpd->map.m_pblk; folio_batch_init(&fbatch); @@ -2309,7 +2316,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) for (i = 0; i < nr; i++) { struct folio *folio = fbatch.folios[i]; - lblk = folio->index << bpp_bits; + lblk = EXT4_PG_TO_LBLK(inode, folio->index); err = mpage_process_folio(mpd, folio, &lblk, &pblock, &map_bh); /* @@ -2462,7 +2469,7 @@ static int mpage_map_and_submit_extent(handle_t *handle, io_end_vec = ext4_alloc_io_end_vec(io_end); if (IS_ERR(io_end_vec)) return PTR_ERR(io_end_vec); - io_end_vec->offset = ((loff_t)map->m_lblk) << inode->i_blkbits; + io_end_vec->offset = EXT4_LBLK_TO_B(inode, map->m_lblk); do { err = mpage_map_one_extent(handle, mpd); if (err < 0) { @@ -2612,7 +2619,6 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) pgoff_t end = mpd->end_pos >> PAGE_SHIFT; xa_mark_t tag; int i, err = 0; - int blkbits = mpd->inode->i_blkbits; ext4_lblk_t lblk; struct buffer_head *head; handle_t *handle = NULL; @@ -2648,7 +2654,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) */ if (mpd->wbc->sync_mode == WB_SYNC_NONE && mpd->wbc->nr_to_write <= - mpd->map.m_len >> (PAGE_SHIFT - blkbits)) + EXT4_LBLK_TO_PG(mpd->inode, mpd->map.m_len)) goto out; /* If we can't merge this page, we are done. 
*/ @@ -2726,8 +2732,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) mpage_folio_done(mpd, folio); } else { /* Add all dirty buffers to mpd */ - lblk = ((ext4_lblk_t)folio->index) << - (PAGE_SHIFT - blkbits); + lblk = EXT4_PG_TO_LBLK(mpd->inode, folio->index); head = folio_buffers(folio); err = mpage_process_page_bufs(mpd, head, head, lblk); @@ -3499,8 +3504,8 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev; else iomap->bdev = inode->i_sb->s_bdev; - iomap->offset = (u64) map->m_lblk << blkbits; - iomap->length = (u64) map->m_len << blkbits; + iomap->offset = EXT4_LBLK_TO_B(inode, map->m_lblk); + iomap->length = EXT4_LBLK_TO_B(inode, map->m_len); if ((map->m_flags & EXT4_MAP_MAPPED) && |
