From 8e4e5cdf2fdeb99445a468b6b6436ad79b9ecb30 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Fri, 17 May 2024 20:39:56 +0800 Subject: ext4: factor out a common helper to query extent map Factor out a new common helper ext4_map_query_blocks() from the ext4_da_map_blocks(), it query and return the extent map status on the inode's extent path, no logic changes. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Ritesh Harjani (IBM) Link: https://patch.msgid.link/20240517124005.347221-2-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 57 ++++++++++++++++++++++++++++++++------------------------- 1 file changed, 32 insertions(+), 25 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4bae9ccf5fe0..168819b4db01 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -453,6 +453,35 @@ static void ext4_map_blocks_es_recheck(handle_t *handle, } #endif /* ES_AGGRESSIVE_TEST */ +static int ext4_map_query_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map) +{ + unsigned int status; + int retval; + + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + retval = ext4_ext_map_blocks(handle, inode, map, 0); + else + retval = ext4_ind_map_blocks(handle, inode, map, 0); + + if (retval <= 0) + return retval; + + if (unlikely(retval != map->m_len)) { + ext4_warning(inode->i_sb, + "ES len assertion failed for inode " + "%lu: retval %d != map->m_len %d", + inode->i_ino, retval, map->m_len); + WARN_ON(1); + } + + status = map->m_flags & EXT4_MAP_UNWRITTEN ? + EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status); + return retval; +} + /* * The ext4_map_blocks() function tries to look up the requested blocks, * and returns if the blocks are already mapped. @@ -1744,33 +1773,11 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, down_read(&EXT4_I(inode)->i_data_sem); if (ext4_has_inline_data(inode)) retval = 0; - else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - retval = ext4_ext_map_blocks(NULL, inode, map, 0); else - retval = ext4_ind_map_blocks(NULL, inode, map, 0); - if (retval < 0) { - up_read(&EXT4_I(inode)->i_data_sem); - return retval; - } - if (retval > 0) { - unsigned int status; - - if (unlikely(retval != map->m_len)) { - ext4_warning(inode->i_sb, - "ES len assertion failed for inode " - "%lu: retval %d != map->m_len %d", - inode->i_ino, retval, map->m_len); - WARN_ON(1); - } - - status = map->m_flags & EXT4_MAP_UNWRITTEN ? - EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; - ext4_es_insert_extent(inode, map->m_lblk, map->m_len, - map->m_pblk, status); - up_read(&EXT4_I(inode)->i_data_sem); - return retval; - } + retval = ext4_map_query_blocks(NULL, inode, map); up_read(&EXT4_I(inode)->i_data_sem); + if (retval) + return retval; add_delayed: down_write(&EXT4_I(inode)->i_data_sem); -- cgit v1.2.3 From 0ea6560abb3bac1ffcfa4bf6b2c4d344fdc27b3c Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Fri, 17 May 2024 20:39:57 +0800 Subject: ext4: check the extent status again before inserting delalloc block ext4_da_map_blocks looks up for any extent entry in the extent status tree (w/o i_data_sem) and then the looks up for any ondisk extent mapping (with i_data_sem in read mode). If it finds a hole in the extent status tree or if it couldn't find any entry at all, it then takes the i_data_sem in write mode to add a da entry into the extent status tree. This can actually race with page mkwrite & fallocate path. Note that this is ok between 1. ext4 buffered-write path v/s ext4_page_mkwrite(), because of the folio lock 2. ext4 buffered write path v/s ext4 fallocate because of the inode lock. But this can race between ext4_page_mkwrite() & ext4 fallocate path ext4_page_mkwrite() ext4_fallocate() block_page_mkwrite() ext4_da_map_blocks() //find hole in extent status tree ext4_alloc_file_blocks() ext4_map_blocks() //allocate block and unwritten extent ext4_insert_delayed_block() ext4_da_reserve_space() //reserve one more block ext4_es_insert_delayed_block() //drop unwritten extent and add delayed extent by mistake Then, the delalloc extent is wrong until writeback and the extra reserved block can't be released any more and it triggers below warning: EXT4-fs (pmem2): Inode 13 (00000000bbbd4d23): i_reserved_data_blocks(1) not cleared! Fix the problem by looking up extent status tree again while the i_data_sem is held in write mode. If it still can't find any entry, then we insert a new da entry into the extent status tree. Cc: stable@vger.kernel.org Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://patch.msgid.link/20240517124005.347221-3-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 168819b4db01..4b0d64a76e88 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1737,6 +1737,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, if (ext4_es_is_hole(&es)) goto add_delayed; +found: /* * Delayed extent could be allocated by fallocate. * So we need to check it. @@ -1781,6 +1782,26 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, add_delayed: down_write(&EXT4_I(inode)->i_data_sem); + /* + * Page fault path (ext4_page_mkwrite does not take i_rwsem) + * and fallocate path (no folio lock) can race. Make sure we + * lookup the extent status tree here again while i_data_sem + * is held in write mode, before inserting a new da entry in + * the extent status tree. + */ + if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) { + if (!ext4_es_is_hole(&es)) { + up_write(&EXT4_I(inode)->i_data_sem); + goto found; + } + } else if (!ext4_has_inline_data(inode)) { + retval = ext4_map_query_blocks(NULL, inode, map); + if (retval) { + up_write(&EXT4_I(inode)->i_data_sem); + return retval; + } + } + retval = ext4_insert_delayed_block(inode, map->m_lblk); up_write(&EXT4_I(inode)->i_data_sem); if (retval) -- cgit v1.2.3 From 14a210c110d1ffbef4ed56e88e3c34de04790ff8 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Fri, 17 May 2024 20:39:59 +0800 Subject: ext4: trim delalloc extent In ext4_da_map_blocks(), we could find four kind of extents in the extent status tree: hole, unwritten, written and delayed extent. Now we only trim the map len if we found an unwritten extent or a written extent. This is okay now since map->m_len is always set to one and we always insert one delayed block at a time. But this will become isn't okay for other two cases if ext4_insert_delayed_block() and ext4_da_map_blocks() support inserting multiple map->len blocks later. 1. If we found a hole in the extent status tree which es->es_len is shorter than the length we want to write, we should trim the map->m_len to prevent adding extra delay more blocks than we expected. For example, assume we write data [A, C) to a file that contains a hole extent [A, B) and a written extent [B, D) in the cache. A B C D before da write: ...hhhhhh|wwwwww.... Then we will get extent [A, B), we should trim map->m_len to B-A before inserting new delalloc blocks, if not, the range [B, C) will be duplicated. 2. If we found a delayed extent in the extent status tree which es->es_len is shorter than the length we want to write, we should trim the map->m_len to es->es_len and return directly since the front part of this map has been delayed, we can't insert the delalloc extent that contains the latter part in this round, we should return the delayed length and the caller should increase the position and call ext4_da_map_blocks() again. For example, assume we write data [A, C) to a file that contains a delayed extent [A, B) in the cache. A B C before da write: ...dddddd|hhh.... Then we will get delayed extent [A, B), we should also trim map->m_len to B-A and return, if not, we will incorrectly assume that the write is complete and won't insert [B, C). So we need to always trim the map->m_len if the found es->es_len in the extent status tree is shorter than the map->m_len, prearing for inserting a extent with multiple delalloc blocks. This patch only does a pre-fix, the handle is crude and ext4_da_map_blocks() deserve a cleanup, we will do that later. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://patch.msgid.link/20240517124005.347221-5-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4b0d64a76e88..200bbe89ded2 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1734,6 +1734,11 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, /* Lookup extent status tree firstly */ if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) { + retval = es.es_len - (iblock - es.es_lblk); + if (retval > map->m_len) + retval = map->m_len; + map->m_len = retval; + if (ext4_es_is_hole(&es)) goto add_delayed; @@ -1750,10 +1755,6 @@ found: } map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk; - retval = es.es_len - (iblock - es.es_lblk); - if (retval > map->m_len) - retval = map->m_len; - map->m_len = retval; if (ext4_es_is_written(&es)) map->m_flags |= EXT4_MAP_MAPPED; else if (ext4_es_is_unwritten(&es)) @@ -1790,6 +1791,11 @@ add_delayed: * the extent status tree. */ if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) { + retval = es.es_len - (iblock - es.es_lblk); + if (retval > map->m_len) + retval = map->m_len; + map->m_len = retval; + if (!ext4_es_is_hole(&es)) { up_write(&EXT4_I(inode)->i_data_sem); goto found; -- cgit v1.2.3 From bb6b18057f18e9b7f53a524721060652de151e8a Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Fri, 17 May 2024 20:40:00 +0800 Subject: ext4: drop iblock parameter The start block of the delalloc extent to be inserted is equal to map->m_lblk, just drop the duplicate iblock input parameter. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Ritesh Harjani (IBM) Link: https://patch.msgid.link/20240517124005.347221-6-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 200bbe89ded2..20528a04903a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1712,8 +1712,7 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) * time. This function looks up the requested blocks and sets the * buffer delay bit under the protection of i_data_sem. */ -static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, - struct ext4_map_blocks *map, +static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map, struct buffer_head *bh) { struct extent_status es; @@ -1733,8 +1732,8 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, (unsigned long) map->m_lblk); /* Lookup extent status tree firstly */ - if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) { - retval = es.es_len - (iblock - es.es_lblk); + if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { + retval = es.es_len - (map->m_lblk - es.es_lblk); if (retval > map->m_len) retval = map->m_len; map->m_len = retval; @@ -1754,7 +1753,7 @@ found: return 0; } - map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk; + map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk; if (ext4_es_is_written(&es)) map->m_flags |= EXT4_MAP_MAPPED; else if (ext4_es_is_unwritten(&es)) @@ -1790,8 +1789,8 @@ add_delayed: * is held in write mode, before inserting a new da entry in * the extent status tree. */ - if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) { - retval = es.es_len - (iblock - es.es_lblk); + if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { + retval = es.es_len - (map->m_lblk - es.es_lblk); if (retval > map->m_len) retval = map->m_len; map->m_len = retval; @@ -1848,7 +1847,7 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, * preallocated blocks are unmapped but should treated * the same as allocated blocks. */ - ret = ext4_da_map_blocks(inode, iblock, &map, bh); + ret = ext4_da_map_blocks(inode, &map, bh); if (ret <= 0) return ret; -- cgit v1.2.3 From 12eba993b94c9186e44a01f46e38b3ab3c635f2d Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Fri, 17 May 2024 20:40:01 +0800 Subject: ext4: make ext4_es_insert_delayed_block() insert multi-blocks Rename ext4_es_insert_delayed_block() to ext4_es_insert_delayed_extent() and pass length parameter to make it insert multiple delalloc blocks at a time. For the case of bigalloc, split the allocated parameter to lclu_allocated and end_allocated. lclu_allocated indicates the allocation state of the cluster which is containing the lblk, end_allocated indicates the allocation state of the extent end, clusters in the middle of delay allocated extent must be unallocated. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://patch.msgid.link/20240517124005.347221-7-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 20528a04903a..c55f1607b7d8 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1702,7 +1702,7 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) } } - ext4_es_insert_delayed_block(inode, lblk, allocated); + ext4_es_insert_delayed_extent(inode, lblk, 1, allocated, false); return 0; } -- cgit v1.2.3 From 0d66b23d79c750276f791411d81a524549a64852 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Fri, 17 May 2024 20:40:02 +0800 Subject: ext4: make ext4_da_reserve_space() reserve multi-clusters Add 'nr_resv' parameter to ext4_da_reserve_space(), which indicates the number of clusters wants to reserve, make it reserve multiple clusters at a time. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://patch.msgid.link/20240517124005.347221-8-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c55f1607b7d8..8e5815b30b5f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1479,9 +1479,9 @@ static int ext4_journalled_write_end(struct file *file, } /* - * Reserve space for a single cluster + * Reserve space for 'nr_resv' clusters */ -static int ext4_da_reserve_space(struct inode *inode) +static int ext4_da_reserve_space(struct inode *inode, int nr_resv) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); @@ -1492,18 +1492,18 @@ static int ext4_da_reserve_space(struct inode *inode) * us from metadata over-estimation, though we may go over by * a small amount in the end. Here we just reserve for data. */ - ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1)); + ret = dquot_reserve_block(inode, EXT4_C2B(sbi, nr_resv)); if (ret) return ret; spin_lock(&ei->i_block_reservation_lock); - if (ext4_claim_free_clusters(sbi, 1, 0)) { + if (ext4_claim_free_clusters(sbi, nr_resv, 0)) { spin_unlock(&ei->i_block_reservation_lock); - dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); + dquot_release_reservation_block(inode, EXT4_C2B(sbi, nr_resv)); return -ENOSPC; } - ei->i_reserved_data_blocks++; - trace_ext4_da_reserve_space(inode); + ei->i_reserved_data_blocks += nr_resv; + trace_ext4_da_reserve_space(inode, nr_resv); spin_unlock(&ei->i_block_reservation_lock); return 0; /* success */ @@ -1678,7 +1678,7 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) * extents status tree doesn't get a match. */ if (sbi->s_cluster_ratio == 1) { - ret = ext4_da_reserve_space(inode); + ret = ext4_da_reserve_space(inode, 1); if (ret != 0) /* ENOSPC */ return ret; } else { /* bigalloc */ @@ -1690,7 +1690,7 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) if (ret < 0) return ret; if (ret == 0) { - ret = ext4_da_reserve_space(inode); + ret = ext4_da_reserve_space(inode, 1); if (ret != 0) /* ENOSPC */ return ret; } else { -- cgit v1.2.3 From 49bf6ab4d30b7a39d86a585e0a58f6c449d2e009 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Fri, 17 May 2024 20:40:03 +0800 Subject: ext4: factor out a helper to check the cluster allocation state Factor out a common helper ext4_clu_alloc_state(), check whether the cluster containing a delalloc block to be added has been allocated or has delalloc reservation, no logic changes. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://patch.msgid.link/20240517124005.347221-9-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 55 ++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 38 insertions(+), 17 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 8e5815b30b5f..4d8c9ab1ddb1 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1649,6 +1649,35 @@ static void ext4_print_free_blocks(struct inode *inode) return; } +/* + * Check whether the cluster containing lblk has been allocated or has + * delalloc reservation. + * + * Returns 0 if the cluster doesn't have either, 1 if it has delalloc + * reservation, 2 if it's already been allocated, negative error code on + * failure. + */ +static int ext4_clu_alloc_state(struct inode *inode, ext4_lblk_t lblk) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + int ret; + + /* Has delalloc reservation? */ + if (ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) + return 1; + + /* Already been allocated? */ + if (ext4_es_scan_clu(inode, &ext4_es_is_mapped, lblk)) + return 2; + ret = ext4_clu_mapped(inode, EXT4_B2C(sbi, lblk)); + if (ret < 0) + return ret; + if (ret > 0) + return 2; + + return 0; +} + /* * ext4_insert_delayed_block - adds a delayed block to the extents status * tree, incrementing the reserved cluster/block @@ -1682,23 +1711,15 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) if (ret != 0) /* ENOSPC */ return ret; } else { /* bigalloc */ - if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) { - if (!ext4_es_scan_clu(inode, - &ext4_es_is_mapped, lblk)) { - ret = ext4_clu_mapped(inode, - EXT4_B2C(sbi, lblk)); - if (ret < 0) - return ret; - if (ret == 0) { - ret = ext4_da_reserve_space(inode, 1); - if (ret != 0) /* ENOSPC */ - return ret; - } else { - allocated = true; - } - } else { - allocated = true; - } + ret = ext4_clu_alloc_state(inode, lblk); + if (ret < 0) + return ret; + if (ret == 2) + allocated = true; + if (ret == 0) { + ret = ext4_da_reserve_space(inode, 1); + if (ret != 0) /* ENOSPC */ + return ret; } } -- cgit v1.2.3 From 1850d76c1b781ad9c7dc3c4968fb40c1915231c0 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Fri, 17 May 2024 20:40:04 +0800 Subject: ext4: make ext4_insert_delayed_block() insert multi-blocks Rename ext4_insert_delayed_block() to ext4_insert_delayed_blocks(), pass length parameter to make it insert multiple delalloc blocks at a time. For non-bigalloc case, just reserve len blocks and insert delalloc extent. For bigalloc case, we can ensure that the clusters in the middle of a extent must be unallocated, we only need to check whether the start and end clusters are delayed/allocated. We should subtract the space for the start and/or end block(s) if they are allocated. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://patch.msgid.link/20240517124005.347221-10-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 51 ++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 36 insertions(+), 15 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4d8c9ab1ddb1..91eb3ba68970 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1679,24 +1679,29 @@ static int ext4_clu_alloc_state(struct inode *inode, ext4_lblk_t lblk) } /* - * ext4_insert_delayed_block - adds a delayed block to the extents status - * tree, incrementing the reserved cluster/block - * count or making a pending reservation - * where needed + * ext4_insert_delayed_blocks - adds a multiple delayed blocks to the extents + * status tree, incrementing the reserved + * cluster/block count or making pending + * reservations where needed * * @inode - file containing the newly added block - * @lblk - logical block to be added + * @lblk - start logical block to be added + * @len - length of blocks to be added * * Returns 0 on success, negative error code on failure. */ -static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) +static int ext4_insert_delayed_blocks(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int ret; - bool allocated = false; + bool lclu_allocated = false; + bool end_allocated = false; + ext4_lblk_t resv_clu; + ext4_lblk_t end = lblk + len - 1; /* - * If the cluster containing lblk is shared with a delayed, + * If the cluster containing lblk or end is shared with a delayed, * written, or unwritten extent in a bigalloc file system, it's * already been accounted for and does not need to be reserved. * A pending reservation must be made for the cluster if it's @@ -1707,23 +1712,39 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) * extents status tree doesn't get a match. */ if (sbi->s_cluster_ratio == 1) { - ret = ext4_da_reserve_space(inode, 1); + ret = ext4_da_reserve_space(inode, len); if (ret != 0) /* ENOSPC */ return ret; } else { /* bigalloc */ + resv_clu = EXT4_B2C(sbi, end) - EXT4_B2C(sbi, lblk) + 1; + ret = ext4_clu_alloc_state(inode, lblk); if (ret < 0) return ret; - if (ret == 2) - allocated = true; - if (ret == 0) { - ret = ext4_da_reserve_space(inode, 1); + if (ret > 0) { + resv_clu--; + lclu_allocated = (ret == 2); + } + + if (EXT4_B2C(sbi, lblk) != EXT4_B2C(sbi, end)) { + ret = ext4_clu_alloc_state(inode, end); + if (ret < 0) + return ret; + if (ret > 0) { + resv_clu--; + end_allocated = (ret == 2); + } + } + + if (resv_clu) { + ret = ext4_da_reserve_space(inode, resv_clu); if (ret != 0) /* ENOSPC */ return ret; } } - ext4_es_insert_delayed_extent(inode, lblk, 1, allocated, false); + ext4_es_insert_delayed_extent(inode, lblk, len, lclu_allocated, + end_allocated); return 0; } @@ -1828,7 +1849,7 @@ add_delayed: } } - retval = ext4_insert_delayed_block(inode, map->m_lblk); + retval = ext4_insert_delayed_blocks(inode, map->m_lblk, map->m_len); up_write(&EXT4_I(inode)->i_data_sem); if (retval) return retval; -- cgit v1.2.3 From 8262fe9a902c8a7b68c8521ebe18360a9145bada Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Fri, 17 May 2024 20:40:05 +0800 Subject: ext4: make ext4_da_map_blocks() buffer_head unaware After calling the ext4_da_map_blocks(), a delalloc extent state could be identified through the EXT4_MAP_DELAYED flag in map. So factor out buffer_head related handles in ext4_da_map_blocks(), make this function buffer_head unaware and becomes a common helper, and also update the stale function commtents, preparing for the iomap da write path in the future. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://patch.msgid.link/20240517124005.347221-11-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 63 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 31 insertions(+), 32 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 91eb3ba68970..c81cf67d8e77 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1749,36 +1749,32 @@ static int ext4_insert_delayed_blocks(struct inode *inode, ext4_lblk_t lblk, } /* - * This function is grabs code from the very beginning of - * ext4_map_blocks, but assumes that the caller is from delayed write - * time. This function looks up the requested blocks and sets the - * buffer delay bit under the protection of i_data_sem. + * Looks up the requested blocks and sets the delalloc extent map. + * First try to look up for the extent entry that contains the requested + * blocks in the extent status tree without i_data_sem, then try to look + * up for the ondisk extent mapping with i_data_sem in read mode, + * finally hold i_data_sem in write mode, looks up again and add a + * delalloc extent entry if it still couldn't find any extent. Pass out + * the mapped extent through @map and return 0 on success. */ -static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map, - struct buffer_head *bh) +static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map) { struct extent_status es; int retval; - sector_t invalid_block = ~((sector_t) 0xffff); #ifdef ES_AGGRESSIVE_TEST struct ext4_map_blocks orig_map; memcpy(&orig_map, map, sizeof(*map)); #endif - if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) - invalid_block = ~0; - map->m_flags = 0; ext_debug(inode, "max_blocks %u, logical block %lu\n", map->m_len, (unsigned long) map->m_lblk); /* Lookup extent status tree firstly */ if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { - retval = es.es_len - (map->m_lblk - es.es_lblk); - if (retval > map->m_len) - retval = map->m_len; - map->m_len = retval; + map->m_len = min_t(unsigned int, map->m_len, + es.es_len - (map->m_lblk - es.es_lblk)); if (ext4_es_is_hole(&es)) goto add_delayed; @@ -1788,10 +1784,8 @@ found: * Delayed extent could be allocated by fallocate. * So we need to check it. */ - if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) { - map_bh(bh, inode->i_sb, invalid_block); - set_buffer_new(bh); - set_buffer_delay(bh); + if (ext4_es_is_delonly(&es)) { + map->m_flags |= EXT4_MAP_DELAYED; return 0; } @@ -1806,7 +1800,7 @@ found: #ifdef ES_AGGRESSIVE_TEST ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0); #endif - return retval; + return 0; } /* @@ -1820,7 +1814,7 @@ found: retval = ext4_map_query_blocks(NULL, inode, map); up_read(&EXT4_I(inode)->i_data_sem); if (retval) - return retval; + return retval < 0 ? retval : 0; add_delayed: down_write(&EXT4_I(inode)->i_data_sem); @@ -1832,10 +1826,8 @@ add_delayed: * the extent status tree. */ if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { - retval = es.es_len - (map->m_lblk - es.es_lblk); - if (retval > map->m_len) - retval = map->m_len; - map->m_len = retval; + map->m_len = min_t(unsigned int, map->m_len, + es.es_len - (map->m_lblk - es.es_lblk)); if (!ext4_es_is_hole(&es)) { up_write(&EXT4_I(inode)->i_data_sem); @@ -1845,18 +1837,14 @@ add_delayed: retval = ext4_map_query_blocks(NULL, inode, map); if (retval) { up_write(&EXT4_I(inode)->i_data_sem); - return retval; + return retval < 0 ? retval : 0; } } + map->m_flags |= EXT4_MAP_DELAYED; retval = ext4_insert_delayed_blocks(inode, map->m_lblk, map->m_len); up_write(&EXT4_I(inode)->i_data_sem); - if (retval) - return retval; - map_bh(bh, inode->i_sb, invalid_block); - set_buffer_new(bh); - set_buffer_delay(bh); return retval; } @@ -1876,11 +1864,15 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create) { struct ext4_map_blocks map; + sector_t invalid_block = ~((sector_t) 0xffff); int ret = 0; BUG_ON(create == 0); BUG_ON(bh->b_size != inode->i_sb->s_blocksize); + if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) + invalid_block = ~0; + map.m_lblk = iblock; map.m_len = 1; @@ -1889,10 +1881,17 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, * preallocated blocks are unmapped but should treated * the same as allocated blocks. */ - ret = ext4_da_map_blocks(inode, &map, bh); - if (ret <= 0) + ret = ext4_da_map_blocks(inode, &map); + if (ret < 0) return ret; + if (map.m_flags & EXT4_MAP_DELAYED) { + map_bh(bh, inode->i_sb, invalid_block); + set_buffer_new(bh); + set_buffer_delay(bh); + return 0; + } + map_bh(bh, inode->i_sb, map.m_pblk); ext4_update_bh_state(bh, map.m_flags); -- cgit v1.2.3 From 83f4414b8f84249d538905825b088ff3ae555652 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wojciech=20G=C5=82adysz?= Date: Wed, 3 Jul 2024 09:01:12 +0200 Subject: ext4: sanity check for NULL pointer after ext4_force_shutdown MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Test case: 2 threads write short inline data to a file. In ext4_page_mkwrite the resulting inline data is converted. Handling ext4_grp_locked_error with description "block bitmap and bg descriptor inconsistent: X vs Y free clusters" calls ext4_force_shutdown. The conversion clears EXT4_STATE_MAY_INLINE_DATA but fails for ext4_destroy_inline_data_nolock and ext4_mark_iloc_dirty due to ext4_forced_shutdown. The restoration of inline data fails for the same reason not setting EXT4_STATE_MAY_INLINE_DATA. Without the flag set a regular process path in ext4_da_write_end follows trying to dereference page folio private pointer that has not been set. The fix calls early return with -EIO error shall the pointer to private be NULL. Sample crash report: Unable to handle kernel paging request at virtual address dfff800000000004 KASAN: null-ptr-deref in range [0x0000000000000020-0x0000000000000027] Mem abort info: ESR = 0x0000000096000005 EC = 0x25: DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 FSC = 0x05: level 1 translation fault Data abort info: ISV = 0, ISS = 0x00000005, ISS2 = 0x00000000 CM = 0, WnR = 0, TnD = 0, TagAccess = 0 GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 [dfff800000000004] address between user and kernel address ranges Internal error: Oops: 0000000096000005 [#1] PREEMPT SMP Modules linked in: CPU: 1 PID: 20274 Comm: syz-executor185 Not tainted 6.9.0-rc7-syzkaller-gfda5695d692c #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 03/27/2024 pstate: 80400005 (Nzcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : __block_commit_write+0x64/0x2b0 fs/buffer.c:2167 lr : __block_commit_write+0x3c/0x2b0 fs/buffer.c:2160 sp : ffff8000a1957600 x29: ffff8000a1957610 x28: dfff800000000000 x27: ffff0000e30e34b0 x26: 0000000000000000 x25: dfff800000000000 x24: dfff800000000000 x23: fffffdffc397c9e0 x22: 0000000000000020 x21: 0000000000000020 x20: 0000000000000040 x19: fffffdffc397c9c0 x18: 1fffe000367bd196 x17: ffff80008eead000 x16: ffff80008ae89e3c x15: 00000000200000c0 x14: 1fffe0001cbe4e04 x13: 0000000000000000 x12: 0000000000000000 x11: 0000000000000001 x10: 0000000000ff0100 x9 : 0000000000000000 x8 : 0000000000000004 x7 : 0000000000000000 x6 : 0000000000000000 x5 : fffffdffc397c9c0 x4 : 0000000000000020 x3 : 0000000000000020 x2 : 0000000000000040 x1 : 0000000000000020 x0 : fffffdffc397c9c0 Call trace: __block_commit_write+0x64/0x2b0 fs/buffer.c:2167 block_write_end+0xb4/0x104 fs/buffer.c:2253 ext4_da_do_write_end fs/ext4/inode.c:2955 [inline] ext4_da_write_end+0x2c4/0xa40 fs/ext4/inode.c:3028 generic_perform_write+0x394/0x588 mm/filemap.c:3985 ext4_buffered_write_iter+0x2c0/0x4ec fs/ext4/file.c:299 ext4_file_write_iter+0x188/0x1780 call_write_iter include/linux/fs.h:2110 [inline] new_sync_write fs/read_write.c:497 [inline] vfs_write+0x968/0xc3c fs/read_write.c:590 ksys_write+0x15c/0x26c fs/read_write.c:643 __do_sys_write fs/read_write.c:655 [inline] __se_sys_write fs/read_write.c:652 [inline] __arm64_sys_write+0x7c/0x90 fs/read_write.c:652 __invoke_syscall arch/arm64/kernel/syscall.c:34 [inline] invoke_syscall+0x98/0x2b8 arch/arm64/kernel/syscall.c:48 el0_svc_common+0x130/0x23c arch/arm64/kernel/syscall.c:133 do_el0_svc+0x48/0x58 arch/arm64/kernel/syscall.c:152 el0_svc+0x54/0x168 arch/arm64/kernel/entry-common.c:712 el0t_64_sync_handler+0x84/0xfc arch/arm64/kernel/entry-common.c:730 el0t_64_sync+0x190/0x194 arch/arm64/kernel/entry.S:598 Code: 97f85911 f94002da 91008356 d343fec8 (38796908) ---[ end trace 0000000000000000 ]--- ---------------- Code disassembly (best guess): 0: 97f85911 bl 0xffffffffffe16444 4: f94002da ldr x26, [x22] 8: 91008356 add x22, x26, #0x20 c: d343fec8 lsr x8, x22, #3 * 10: 38796908 ldrb w8, [x8, x25] <-- trapping instruction Reported-by: syzbot+18df508cf00a0598d9a6@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=18df508cf00a0598d9a6 Link: https://lore.kernel.org/all/000000000000f19a1406109eb5c5@google.com/T/ Signed-off-by: Wojciech Gładysz Link: https://patch.msgid.link/20240703070112.10235-1-wojciech.gladysz@infogain.com Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c81cf67d8e77..941c1c0d5c6e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3019,6 +3019,11 @@ static int ext4_da_do_write_end(struct address_space *mapping, bool disksize_changed = false; loff_t new_i_size; + if (unlikely(!folio_buffers(folio))) { + folio_unlock(folio); + folio_put(folio); + return -EIO; + } /* * block_write_end() will mark the inode as dirty with I_DIRTY_PAGES * flag, which all that's needed to trigger page writeback. -- cgit v1.2.3