From 0b956de1512e23329aa27f60fe4a8b3e6afc7d6a Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Mon, 15 May 2023 16:10:40 +0530 Subject: ext4: kill unused function ext4_journalled_write_inline_data Commit 3f079114bf522 ("ext4: Convert data=journal writeback to use ext4_writepages()") added support for writeback of journalled data into ext4_writepages() and killed the function __ext4_journalled_writepage(), which used to call ext4_journalled_write_inline_data() for inline data. This function got left over by mistake. Hence kill its definition as no one uses it. Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/122b2a8d5e0650686f23ed6da26ed9e04105562b.1684122756.git.ritesh.list@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 4 ---- fs/ext4/inline.c | 24 ------------------------ 2 files changed, 28 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 8104a21b001a..15abcbe988ec 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3484,10 +3484,6 @@ extern int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied, struct page *page); -extern struct buffer_head * -ext4_journalled_write_inline_data(struct inode *inode, - unsigned len, - struct page *page); extern int ext4_da_write_inline_data_begin(struct address_space *mapping, struct inode *inode, loff_t pos, unsigned len, diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 5854bd5a3352..c0b2dc6514b2 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -823,30 +823,6 @@ out: return ret ? ret : copied; } -struct buffer_head * -ext4_journalled_write_inline_data(struct inode *inode, - unsigned len, - struct page *page) -{ - int ret, no_expand; - void *kaddr; - struct ext4_iloc iloc; - - ret = ext4_get_inode_loc(inode, &iloc); - if (ret) { - ext4_std_error(inode->i_sb, ret); - return NULL; - } - - ext4_write_lock_xattr(inode, &no_expand); - kaddr = kmap_atomic(page); - ext4_write_inline_data(inode, &iloc, kaddr, 0, len); - kunmap_atomic(kaddr); - ext4_write_unlock_xattr(inode, &no_expand); - - return iloc.bh; -} - /* * Try to make the page cache and handle ready for the inline data case. * We can call this function in 2 cases: -- cgit v1.2.3 From 36c9b4504088089185f7743433c914935b518ab2 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Mon, 15 May 2023 16:10:42 +0530 Subject: ext4: Change remaining tracepoints to use folio ext4_readpage() is converted to ext4_read_folio(), hence change the related tracepoint from trace_ext4_readpage(page) to trace_ext4_read_folio(folio). Do the same for trace_ext4_releasepage(page), changing it to trace_ext4_release_folio(folio). As a minor optimization to avoid an extra dereference, since both of the above functions already dereference folio->mapping->host, change the tracepoint arguments to take (inode, folio).
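For readers following the tracepoint side of this change, the sketch below shows roughly what a folio tracepoint taking (inode, folio) can look like using the standard TRACE_EVENT() machinery. It is a minimal illustration, not the actual definition from include/trace/events/ext4.h; the recorded fields are assumptions chosen for the example.

TRACE_EVENT(ext4_read_folio,
	TP_PROTO(struct inode *inode, struct folio *folio),
	TP_ARGS(inode, folio),

	TP_STRUCT__entry(
		__field(dev_t, dev)
		__field(ino_t, ino)
		__field(pgoff_t, index)
	),

	TP_fast_assign(
		/* The caller passes inode directly, so the tracepoint does
		 * not need to dereference folio->mapping->host again. */
		__entry->dev = inode->i_sb->s_dev;
		__entry->ino = inode->i_ino;
		__entry->index = folio->index;
	),

	TP_printk("dev %d,%d ino %lu index %lu",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  (unsigned long)__entry->ino,
		  (unsigned long)__entry->index)
);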
Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/caba2b3c0147bed4ea7706767dc1d19cd0e29ab0.1684122756.git.ritesh.list@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 02de439bf1f0..e76a95267178 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3105,7 +3105,7 @@ static int ext4_read_folio(struct file *file, struct folio *folio) int ret = -EAGAIN; struct inode *inode = folio->mapping->host; - trace_ext4_readpage(&folio->page); + trace_ext4_read_folio(inode, folio); if (ext4_has_inline_data(inode)) ret = ext4_readpage_inline(inode, folio); @@ -3164,9 +3164,10 @@ static void ext4_journalled_invalidate_folio(struct folio *folio, static bool ext4_release_folio(struct folio *folio, gfp_t wait) { - journal_t *journal = EXT4_JOURNAL(folio->mapping->host); + struct inode *inode = folio->mapping->host; + journal_t *journal = EXT4_JOURNAL(inode); - trace_ext4_releasepage(&folio->page); + trace_ext4_release_folio(inode, folio); /* Page has dirty journalled data -> cannot release */ if (folio_test_checked(folio)) -- cgit v1.2.3 From 80be8c5cc9251484ba856e2cf7d15a189326fe9c Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Mon, 15 May 2023 16:10:43 +0530 Subject: ext4: Make mpage_journal_page_buffers use folio This patch converts mpage_journal_page_buffers() to use folio and also removes the PAGE_SIZE assumption. Signed-off-by: Ritesh Harjani (IBM) Reviewed-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/ebc3ac80e6a54b53327740d010ce684a83512021.1684122756.git.ritesh.list@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e76a95267178..0ff4624178cd 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2321,11 +2321,11 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode) MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp); } -static int ext4_journal_page_buffers(handle_t *handle, struct page *page, - int len) +static int ext4_journal_folio_buffers(handle_t *handle, struct folio *folio, + size_t len) { - struct buffer_head *page_bufs = page_buffers(page); - struct inode *inode = page->mapping->host; + struct buffer_head *page_bufs = folio_buffers(folio); + struct inode *inode = folio->mapping->host; int ret, err; ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, @@ -2334,7 +2334,7 @@ static int ext4_journal_page_buffers(handle_t *handle, struct page *page, NULL, write_end_fn); if (ret == 0) ret = err; - err = ext4_jbd2_inode_add_write(handle, inode, page_offset(page), len); + err = ext4_jbd2_inode_add_write(handle, inode, folio_pos(folio), len); if (ret == 0) ret = err; EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; @@ -2344,22 +2344,20 @@ static int ext4_journal_page_buffers(handle_t *handle, struct page *page, static int mpage_journal_page_buffers(handle_t *handle, struct mpage_da_data *mpd, - struct page *page) + struct folio *folio) { struct inode *inode = mpd->inode; loff_t size = i_size_read(inode); - int len; + size_t len = folio_size(folio); - ClearPageChecked(page); + folio_clear_checked(folio); mpd->wbc->nr_to_write--; - if (page->index == size >> PAGE_SHIFT && + if (folio_pos(folio) + len > size && !ext4_verity_in_progress(inode)) - len = size & ~PAGE_MASK; - else - len = PAGE_SIZE; + len = 
size - folio_pos(folio); - return ext4_journal_page_buffers(handle, page, len); + return ext4_journal_folio_buffers(handle, folio, len); } /* @@ -2499,7 +2497,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) /* Pending dirtying of journalled data? */ if (folio_test_checked(folio)) { err = mpage_journal_page_buffers(handle, - mpd, &folio->page); + mpd, folio); if (err < 0) goto out; mpd->journalled_more_data = 1; @@ -6157,7 +6155,7 @@ retry_alloc: err = __block_write_begin(&folio->page, 0, len, ext4_get_block); if (!err) { ret = VM_FAULT_SIGBUS; - if (ext4_journal_page_buffers(handle, &folio->page, len)) + if (ext4_journal_folio_buffers(handle, folio, len)) goto out_error; } else { folio_unlock(folio); -- cgit v1.2.3 From d19500da4b8cfe1791e1583ab237c3213e1bd39c Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Mon, 15 May 2023 16:10:44 +0530 Subject: ext4: Make ext4_write_inline_data_end() use folio ext4_write_inline_data_end() is completely converted to work with folio. Also, all callers of ext4_write_inline_data_end() already work on folio except ext4_da_write_end(). Mostly for consistency, and maybe to save a few instructions, this patch converts ext4_da_write_end() to work with folio, which means the last caller of ext4_write_inline_data_end() is also converted to work with folio. We then make ext4_write_inline_data_end() take folio instead of page. Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/1bcea771720ff451a5a59b3f1bcd5fae51cb7ce7.1684122756.git.ritesh.list@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 6 ++---- fs/ext4/inline.c | 3 +-- fs/ext4/inode.c | 23 ++++++++++++++--------- 3 files changed, 17 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 15abcbe988ec..ec9cd6252185 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3480,10 +3480,8 @@ extern int ext4_try_to_write_inline_data(struct address_space *mapping, struct inode *inode, loff_t pos, unsigned len, struct page **pagep); -extern int ext4_write_inline_data_end(struct inode *inode, - loff_t pos, unsigned len, - unsigned copied, - struct page *page); +int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, + unsigned copied, struct folio *folio); extern int ext4_da_write_inline_data_begin(struct address_space *mapping, struct inode *inode, loff_t pos, unsigned len, diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index c0b2dc6514b2..f48183f91c87 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -741,9 +741,8 @@ convert: } int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, - unsigned copied, struct page *page) + unsigned copied, struct folio *folio) { - struct folio *folio = page_folio(page); handle_t *handle = ext4_journal_current_handle(); int no_expand; void *kaddr; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 0ff4624178cd..c2868282ad81 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1287,7 +1287,8 @@ static int ext4_write_end(struct file *file, if (ext4_has_inline_data(inode) && ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) - return ext4_write_inline_data_end(inode, pos, len, copied, page); + return ext4_write_inline_data_end(inode, pos, len, copied, + folio); copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); /* @@ -1395,7 +1396,8 @@ static int ext4_journalled_write_end(struct file *file, BUG_ON(!ext4_handle_valid(handle)); if (ext4_has_inline_data(inode)) - return
ext4_write_inline_data_end(inode, pos, len, copied, page); + return ext4_write_inline_data_end(inode, pos, len, copied, + folio); if (unlikely(copied < len) && !folio_test_uptodate(folio)) { copied = 0; @@ -2942,15 +2944,15 @@ retry: * Check if we should update i_disksize * when write to the end of file but not require block allocation */ -static int ext4_da_should_update_i_disksize(struct page *page, +static int ext4_da_should_update_i_disksize(struct folio *folio, unsigned long offset) { struct buffer_head *bh; - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; unsigned int idx; int i; - bh = page_buffers(page); + bh = folio_buffers(folio); idx = offset >> inode->i_blkbits; for (i = 0; i < idx; i++) @@ -2970,17 +2972,19 @@ static int ext4_da_write_end(struct file *file, loff_t new_i_size; unsigned long start, end; int write_mode = (int)(unsigned long)fsdata; + struct folio *folio = page_folio(page); if (write_mode == FALL_BACK_TO_NONDELALLOC) return ext4_write_end(file, mapping, pos, - len, copied, page, fsdata); + len, copied, &folio->page, fsdata); trace_ext4_da_write_end(inode, pos, len, copied); if (write_mode != CONVERT_INLINE_DATA && ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) && ext4_has_inline_data(inode)) - return ext4_write_inline_data_end(inode, pos, len, copied, page); + return ext4_write_inline_data_end(inode, pos, len, copied, + folio); if (unlikely(copied < len) && !PageUptodate(page)) copied = 0; @@ -3004,10 +3008,11 @@ static int ext4_da_write_end(struct file *file, */ new_i_size = pos + copied; if (copied && new_i_size > inode->i_size && - ext4_da_should_update_i_disksize(page, end)) + ext4_da_should_update_i_disksize(folio, end)) ext4_update_i_disksize(inode, new_i_size); - return generic_write_end(file, mapping, pos, len, copied, page, fsdata); + return generic_write_end(file, mapping, pos, len, copied, &folio->page, + fsdata); } /* -- cgit v1.2.3 From 0dea40aa315d01f5d0ffbd871483ea8a45c71f65 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 16 May 2023 20:27:13 +0100 Subject: ext4: Call fsverity_verify_folio() Now that fsverity supports working on entire folios, call fsverity_verify_folio() instead of fsverity_verify_page() Reported-by: Eric Biggers Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Eric Biggers Reviewed-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/20230516192713.1070469-1-willy@infradead.org Signed-off-by: Theodore Ts'o --- fs/ext4/readpage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 6f46823fba61..3e7d160f543f 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -334,7 +334,7 @@ int ext4_mpage_readpages(struct inode *inode, folio_size(folio)); if (first_hole == 0) { if (ext4_need_verity(inode, folio->index) && - !fsverity_verify_page(&folio->page)) + !fsverity_verify_folio(folio)) goto set_error_page; folio_mark_uptodate(folio); folio_unlock(folio); -- cgit v1.2.3 From b3916da0d9636285c8a881802956631f9e2ebb5b Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Sat, 3 Jun 2023 23:03:09 +0800 Subject: ext4: fix wrong unit use in ext4_mb_normalize_request NRL_CHECK_SIZE will compare input req and size, so req and size should be in same unit. Input req "fe_len" is in cluster unit while input size "(8<<20)>>bsbits" is in block unit. Convert "fe_len" to block unit to fix the mismatch. 
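For context on this and the following unit fixes: on a bigalloc filesystem a cluster spans 2^s_cluster_bits blocks, and ext4 converts between the two units with shift-based helpers along these lines (paraphrased from fs/ext4/ext4.h for illustration):

/* Translate a cluster number or count to blocks (exact, a left shift). */
#define EXT4_C2B(sbi, cluster)	((cluster) << (sbi)->s_cluster_bits)
/* Translate a block number to its containing cluster (a right shift). */
#define EXT4_B2C(sbi, blk)	((blk) >> (sbi)->s_cluster_bits)

As a concrete example of the bug being fixed: with 4k blocks (bsbits == 12) and 16 blocks per cluster, an fe_len of 32 clusters is 512 blocks, while (8<<20)>>bsbits is 2048 blocks; comparing the raw 32 against 2048 silently understates the request, whereas EXT4_C2B(sbi, 32) == 512 restores a blocks-to-blocks comparison.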
Signed-off-by: Kemeng Shi Reviewed-by: Ojaswin Mujoo Link: https://lore.kernel.org/r/20230603150327.3596033-2-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 20f67a260df5..7d88f76070be 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4283,7 +4283,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, start_off = ((loff_t)ac->ac_o_ex.fe_logical >> (22 - bsbits)) << 22; size = 4 * 1024 * 1024; - } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len, + } else if (NRL_CHECK_SIZE(EXT4_C2B(sbi, ac->ac_o_ex.fe_len), (8<<20)>>bsbits, max, 8 * 1024)) { start_off = ((loff_t)ac->ac_o_ex.fe_logical >> (23 - bsbits)) << 23; -- cgit v1.2.3 From 497885f72d930305d8e61b6b616b22b4da1adf90 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Sat, 3 Jun 2023 23:03:10 +0800 Subject: ext4: fix unit mismatch in ext4_mb_new_blocks_simple The "i" returned from mb_find_next_zero_bit is in cluster unit and we need offset "block" corresponding to "i" in block unit. Convert "i" to block unit to fix the unit mismatch. Signed-off-by: Kemeng Shi Reviewed-by: Ojaswin Mujoo Link: https://lore.kernel.org/r/20230603150327.3596033-3-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 7d88f76070be..d8caef7cd9d0 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -6011,6 +6011,7 @@ static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle, { struct buffer_head *bitmap_bh; struct super_block *sb = ar->inode->i_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t group; ext4_grpblk_t blkoff; ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); @@ -6039,7 +6040,8 @@ static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle, if (i >= max) break; if (ext4_fc_replay_check_excluded(sb, - ext4_group_first_block_no(sb, group) + i)) { + ext4_group_first_block_no(sb, group) + + EXT4_C2B(sbi, i))) { blkoff = i + 1; } else break; @@ -6056,7 +6058,7 @@ static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle, return 0; } - block = ext4_group_first_block_no(sb, group) + i; + block = ext4_group_first_block_no(sb, group) + EXT4_C2B(sbi, i); ext4_mb_mark_bb(sb, block, 1, 1); ar->len = 1; -- cgit v1.2.3 From 99c515e3a860576ba90c11acbc1d6488dfca6463 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Sat, 3 Jun 2023 23:03:11 +0800 Subject: ext4: fix wrong unit use in ext4_mb_find_by_goal We need start in block unit while fe_start is in cluster unit. Use ext4_grp_offs_to_block helper to convert fe_start to get start in block unit. 
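The helper adopted here folds the cluster-to-block conversion into the address computation. Below is a sketch of the logic it implements, written from the commit description rather than copied from the ext4 mballoc headers, where the real inline helper lives:

/* Block address of a free extent: the group's first block plus the
 * extent's start offset, converted from clusters to blocks. */
static inline ext4_fsblk_t grp_offs_to_block_sketch(struct super_block *sb,
						    struct ext4_free_extent *fex)
{
	return ext4_group_first_block_no(sb, fex->fe_group) +
	       EXT4_C2B(EXT4_SB(sb), fex->fe_start);
}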
Signed-off-by: Kemeng Shi Reviewed-by: Ojaswin Mujoo Link: https://lore.kernel.org/r/20230603150327.3596033-4-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index d8caef7cd9d0..f6dc4f276ca4 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2210,8 +2210,7 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { ext4_fsblk_t start; - start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) + - ex.fe_start; + start = ext4_grp_offs_to_block(ac->ac_sb, &ex); /* use do_div to get remainder (would be 64-bit modulo) */ if (do_div(start, sbi->s_stripe) == 0) { ac->ac_found++; -- cgit v1.2.3 From c3defd99d58cbdd132bd197714e5523dac976b66 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Sat, 3 Jun 2023 23:03:12 +0800 Subject: ext4: treat stripe in block unit Stripe is misused in block unit and in cluster unit in different code paths. Users aware of stripe may not be aware of the bigalloc feature, so treat stripe only in block unit to fix this. Besides, it's hard to get stripe-aligned blocks (start and length both aligned with stripe) if stripe is not aligned with cluster, so just disable stripe and alert the user in this case to simplify the code and avoid unnecessary work to get stripe-aligned blocks, which is likely to fail. Signed-off-by: Kemeng Shi Reviewed-by: Ojaswin Mujoo Link: https://lore.kernel.org/r/20230603150327.3596033-5-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 18 +++++++++++------- fs/ext4/super.c | 13 +++++++++++++ 2 files changed, 24 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index f6dc4f276ca4..7ef95bdde91d 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2207,7 +2207,8 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, ac->ac_g_ex.fe_len, &ex); ex.fe_logical = 0xDEADFA11; /* debug value */ - if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { + if (max >= ac->ac_g_ex.fe_len && + ac->ac_g_ex.fe_len == EXT4_B2C(sbi, sbi->s_stripe)) { ext4_fsblk_t start; start = ext4_grp_offs_to_block(ac->ac_sb, &ex); @@ -2372,7 +2373,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, struct ext4_free_extent ex; ext4_fsblk_t first_group_block; ext4_fsblk_t a; - ext4_grpblk_t i; + ext4_grpblk_t i, stripe; int max; BUG_ON(sbi->s_stripe == 0); @@ -2384,10 +2385,12 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, do_div(a, sbi->s_stripe); i = (a * sbi->s_stripe) - first_group_block; + stripe = EXT4_B2C(sbi, sbi->s_stripe); + i = EXT4_B2C(sbi, i); while (i < EXT4_CLUSTERS_PER_GROUP(sb)) { if (!mb_test_bit(i, bitmap)) { - max = mb_find_extent(e4b, i, sbi->s_stripe, &ex); - if (max >= sbi->s_stripe) { + max = mb_find_extent(e4b, i, stripe, &ex); + if (max >= stripe) { ac->ac_found++; ex.fe_logical = 0xDEADF00D; /* debug value */ ac->ac_b_ex = ex; @@ -2395,7 +2398,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, break; } } - i += sbi->s_stripe; + i += stripe; } } @@ -2758,7 +2761,8 @@ repeat: if (cr == 0) ext4_mb_simple_scan_group(ac, &e4b); else if (cr == 1 && sbi->s_stripe && - !(ac->ac_g_ex.fe_len % sbi->s_stripe)) + !(ac->ac_g_ex.fe_len % + EXT4_B2C(sbi, sbi->s_stripe))) ext4_mb_scan_aligned(ac, &e4b); else ext4_mb_complex_scan_group(ac, &e4b); @@ -3477,7 +3481,7 @@ int ext4_mb_init(struct
super_block *sb) */ if (sbi->s_stripe > 1) { sbi->s_mb_group_prealloc = roundup( - sbi->s_mb_group_prealloc, sbi->s_stripe); + sbi->s_mb_group_prealloc, EXT4_B2C(sbi, sbi->s_stripe)); } sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 05fcecc36244..39591fb78c1d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5297,6 +5297,19 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) goto failed_mount3; sbi->s_stripe = ext4_get_stripe_size(sbi); + /* + * It's hard to get stripe aligned blocks if stripe is not aligned with + * cluster, just disable stripe and alert user to simpfy code and avoid + * stripe aligned allocation which will rarely successes. + */ + if (sbi->s_stripe > 0 && sbi->s_cluster_ratio > 1 && + sbi->s_stripe % sbi->s_cluster_ratio != 0) { + ext4_msg(sb, KERN_WARNING, + "stripe (%lu) is not aligned with cluster size (%u), " + "stripe is disabled", + sbi->s_stripe, sbi->s_cluster_ratio); + sbi->s_stripe = 0; + } sbi->s_extent_max_zeroout_kb = 32; /* -- cgit v1.2.3 From 1eff590489a213a213c57d96b86f48b32cdf8c3a Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Sat, 3 Jun 2023 23:03:13 +0800 Subject: ext4: add EXT4_MB_HINT_GOAL_ONLY test in ext4_mb_use_preallocated ext4_mb_use_preallocated will ignore the demand to alloc goal blocks, although the EXT4_MB_HINT_GOAL_ONLY is requested. For group pa, ext4_mb_group_or_file will not set EXT4_MB_HINT_GROUP_ALLOC if EXT4_MB_HINT_GOAL_ONLY is set. So we will not alloc goal blocks from group pa if EXT4_MB_HINT_GOAL_ONLY is set. For inode pa, ext4_mb_pa_goal_check is added to check if free extent in found inode pa meets goal blocks when EXT4_MB_HINT_GOAL_ONLY is set. Signed-off-by: Kemeng Shi Suggested-by: Ojaswin Mujoo Link: https://lore.kernel.org/r/20230603150327.3596033-6-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 7ef95bdde91d..839db9c46aea 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4531,6 +4531,37 @@ ext4_mb_check_group_pa(ext4_fsblk_t goal_block, return pa; } +/* + * check if found pa meets EXT4_MB_HINT_GOAL_ONLY + */ +static bool +ext4_mb_pa_goal_check(struct ext4_allocation_context *ac, + struct ext4_prealloc_space *pa) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + ext4_fsblk_t start; + + if (likely(!(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))) + return true; + + /* + * If EXT4_MB_HINT_GOAL_ONLY is set, ac_g_ex will not be adjusted + * in ext4_mb_normalize_request and will keep same with ac_o_ex + * from ext4_mb_initialize_context. Choose ac_g_ex here to keep + * consistent with ext4_mb_find_by_goal. 
+ */ + start = pa->pa_pstart + + (ac->ac_g_ex.fe_logical - pa->pa_lstart); + if (ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex) != start) + return false; + + if (ac->ac_g_ex.fe_len > pa->pa_len - + EXT4_B2C(sbi, ac->ac_g_ex.fe_logical - pa->pa_lstart)) + return false; + + return true; +} + /* * search goal blocks in preallocated space */ @@ -4581,7 +4612,8 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) /* found preallocated blocks, use them */ spin_lock(&tmp_pa->pa_lock); - if (tmp_pa->pa_deleted == 0 && tmp_pa->pa_free) { + if (tmp_pa->pa_deleted == 0 && tmp_pa->pa_free && + likely(ext4_mb_pa_goal_check(ac, tmp_pa))) { atomic_inc(&tmp_pa->pa_count); ext4_mb_use_inode_pa(ac, tmp_pa); spin_unlock(&tmp_pa->pa_lock); -- cgit v1.2.3 From 95a4c3c7e0342f553fed1ac70d10c2efc6bba3b1 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Sat, 3 Jun 2023 23:03:14 +0800 Subject: ext4: remove ext4_block_group and ext4_block_group_offset declaration For ext4_block_group and ext4_block_group_offset, there are only declarations without definitions. Just remove them. Signed-off-by: Kemeng Shi Reviewed-by: Ojaswin Mujoo Link: https://lore.kernel.org/r/20230603150327.3596033-7-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ec9cd6252185..e6d80325718d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2632,10 +2632,6 @@ extern void ext4_get_group_no_and_offset(struct super_block *sb, extern ext4_group_t ext4_get_group_number(struct super_block *sb, ext4_fsblk_t block); -extern unsigned int ext4_block_group(struct super_block *sb, - ext4_fsblk_t blocknr); -extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb, - ext4_fsblk_t blocknr); extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group); extern unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group); -- cgit v1.2.3 From 19a043bb1fd1b5cb2652ca33536c55e6c0a70df0 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Sat, 3 Jun 2023 23:03:15 +0800 Subject: ext4: try all groups in ext4_mb_new_blocks_simple ext4_mb_new_blocks_simple ignores the groups before the goal group, so it will fail if free blocks reside only in groups before the goal. Try all groups to avoid this unexpected failure. The search finishes either when a free block is found or when no available blocks remain. Simply check "i >= max" to distinguish the above cases.
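The wrap-around scan this commit introduces is a common pattern: bound the loop by the number of groups rather than by the group index, so the search can roll past the last group back to group 0 while still visiting each group exactly once. A minimal sketch, where try_group() is a hypothetical stand-in for the bitmap scan in the real function:

ext4_group_t ngroups = ext4_get_groups_count(sb);
ext4_group_t group, nr;
ext4_grpblk_t blkoff;

ext4_get_group_no_and_offset(sb, goal, &group, &blkoff);
for (nr = ngroups; nr > 0; nr--) {
	if (try_group(sb, group, blkoff))	/* hypothetical helper */
		break;				/* found a usable block */
	if (++group >= ngroups)
		group = 0;			/* wrap to the first group */
	blkoff = 0;	/* groups after the goal are scanned from 0 */
}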
Signed-off-by: Kemeng Shi Suggested-by: Theodore Ts'o Reviewed-by: Ojaswin Mujoo Link: https://lore.kernel.org/r/20230603150327.3596033-8-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 839db9c46aea..bcc9622daa4a 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -6047,7 +6047,7 @@ static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle, struct buffer_head *bitmap_bh; struct super_block *sb = ar->inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); - ext4_group_t group; + ext4_group_t group, nr; ext4_grpblk_t blkoff; ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); ext4_grpblk_t i = 0; @@ -6061,7 +6061,7 @@ static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle, ar->len = 0; ext4_get_group_no_and_offset(sb, goal, &group, &blkoff); - for (; group < ext4_get_groups_count(sb); group++) { + for (nr = ext4_get_groups_count(sb); nr > 0; nr--) { bitmap_bh = ext4_read_block_bitmap(sb, group); if (IS_ERR(bitmap_bh)) { *errp = PTR_ERR(bitmap_bh); @@ -6085,10 +6085,13 @@ static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle, if (i < max) break; + if (++group >= ext4_get_groups_count(sb)) + group = 0; + blkoff = 0; } - if (group >= ext4_get_groups_count(sb) || i >= max) { + if (i >= max) { *errp = -ENOSPC; return 0; } -- cgit v1.2.3 From 11b6890be0084ad4df0e06d89a9fdcc948472c65 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Sat, 3 Jun 2023 23:03:16 +0800 Subject: ext4: get block from bh in ext4_free_blocks for fast commit replay ext4_free_blocks will retrieve block from bh if block parameter is zero. Retrieve block before ext4_free_blocks_simple to avoid potentially passing wrong block to ext4_free_blocks_simple. Signed-off-by: Kemeng Shi Cc: stable@kernel.org Reviewed-by: Ojaswin Mujoo Link: https://lore.kernel.org/r/20230603150327.3596033-9-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index bcc9622daa4a..dadda9e8cf29 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -6368,12 +6368,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, sbi = EXT4_SB(sb); - if (sbi->s_mount_state & EXT4_FC_REPLAY) { - ext4_free_blocks_simple(inode, block, count); - return; - } - - might_sleep(); if (bh) { if (block) BUG_ON(block != bh->b_blocknr); @@ -6381,6 +6375,13 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, block = bh->b_blocknr; } + if (sbi->s_mount_state & EXT4_FC_REPLAY) { + ext4_free_blocks_simple(inode, block, count); + return; + } + + might_sleep(); + if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && !ext4_inode_block_valid(inode, block, count)) { ext4_error(sb, "Freeing blocks not in datazone - " -- cgit v1.2.3 From ad78b5efe4246e5deba8d44a6ed172b8a00d3113 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Sat, 3 Jun 2023 23:03:17 +0800 Subject: ext4: remove unused parameter from ext4_mb_new_blocks_simple() Two cleanups for ext4_mb_new_blocks_simple: Remove unused parameter handle of ext4_mb_new_blocks_simple. Move ext4_mb_new_blocks_simple definition before ext4_mb_new_blocks to remove unnecessary forward declaration of ext4_mb_new_blocks_simple. 
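The second cleanup is the usual C ordering rule: a static function defined before its first use needs no separate forward declaration. A trivial illustration with made-up names:

/* Before: the use precedes the definition, so a declaration is needed. */
static int helper(int x);

static int caller(void)
{
	return helper(1);
}

static int helper(int x)
{
	return x + 1;
}

/* After: defining the function first makes the declaration unnecessary. */
static int helper2(int x)
{
	return x + 1;
}

static int caller2(void)
{
	return helper2(1);
}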
Signed-off-by: Kemeng Shi Reviewed-by: Ojaswin Mujoo Link: https://lore.kernel.org/r/20230603150327.3596033-10-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 137 ++++++++++++++++++++++++++---------------------------- 1 file changed, 67 insertions(+), 70 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index dadda9e8cf29..1345c4e84cc3 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -5786,8 +5786,72 @@ out_dbg: return ret; } -static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle, - struct ext4_allocation_request *ar, int *errp); +/* + * Simple allocator for Ext4 fast commit replay path. It searches for blocks + * linearly starting at the goal block and also excludes the blocks which + * are going to be in use after fast commit replay. + */ +static ext4_fsblk_t +ext4_mb_new_blocks_simple(struct ext4_allocation_request *ar, int *errp) +{ + struct buffer_head *bitmap_bh; + struct super_block *sb = ar->inode->i_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_group_t group, nr; + ext4_grpblk_t blkoff; + ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); + ext4_grpblk_t i = 0; + ext4_fsblk_t goal, block; + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + + goal = ar->goal; + if (goal < le32_to_cpu(es->s_first_data_block) || + goal >= ext4_blocks_count(es)) + goal = le32_to_cpu(es->s_first_data_block); + + ar->len = 0; + ext4_get_group_no_and_offset(sb, goal, &group, &blkoff); + for (nr = ext4_get_groups_count(sb); nr > 0; nr--) { + bitmap_bh = ext4_read_block_bitmap(sb, group); + if (IS_ERR(bitmap_bh)) { + *errp = PTR_ERR(bitmap_bh); + pr_warn("Failed to read block bitmap\n"); + return 0; + } + + while (1) { + i = mb_find_next_zero_bit(bitmap_bh->b_data, max, + blkoff); + if (i >= max) + break; + if (ext4_fc_replay_check_excluded(sb, + ext4_group_first_block_no(sb, group) + + EXT4_C2B(sbi, i))) { + blkoff = i + 1; + } else + break; + } + brelse(bitmap_bh); + if (i < max) + break; + + if (++group >= ext4_get_groups_count(sb)) + group = 0; + + blkoff = 0; + } + + if (i >= max) { + *errp = -ENOSPC; + return 0; + } + + block = ext4_group_first_block_no(sb, group) + EXT4_C2B(sbi, i); + ext4_mb_mark_bb(sb, block, 1, 1); + ar->len = 1; + + return block; +} /* * Main entry point into mballoc to allocate blocks @@ -5812,7 +5876,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, trace_ext4_request_blocks(ar); if (sbi->s_mount_state & EXT4_FC_REPLAY) - return ext4_mb_new_blocks_simple(handle, ar, errp); + return ext4_mb_new_blocks_simple(ar, errp); /* Allow to use superuser reservation for quota file */ if (ext4_is_quota_file(ar->inode)) @@ -6036,73 +6100,6 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, spin_unlock(&sbi->s_md_lock); } -/* - * Simple allocator for Ext4 fast commit replay path. It searches for blocks - * linearly starting at the goal block and also excludes the blocks which - * are going to be in use after fast commit replay. 
- */ -static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle, - struct ext4_allocation_request *ar, int *errp) -{ - struct buffer_head *bitmap_bh; - struct super_block *sb = ar->inode->i_sb; - struct ext4_sb_info *sbi = EXT4_SB(sb); - ext4_group_t group, nr; - ext4_grpblk_t blkoff; - ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); - ext4_grpblk_t i = 0; - ext4_fsblk_t goal, block; - struct ext4_super_block *es = EXT4_SB(sb)->s_es; - - goal = ar->goal; - if (goal < le32_to_cpu(es->s_first_data_block) || - goal >= ext4_blocks_count(es)) - goal = le32_to_cpu(es->s_first_data_block); - - ar->len = 0; - ext4_get_group_no_and_offset(sb, goal, &group, &blkoff); - for (nr = ext4_get_groups_count(sb); nr > 0; nr--) { - bitmap_bh = ext4_read_block_bitmap(sb, group); - if (IS_ERR(bitmap_bh)) { - *errp = PTR_ERR(bitmap_bh); - pr_warn("Failed to read block bitmap\n"); - return 0; - } - - while (1) { - i = mb_find_next_zero_bit(bitmap_bh->b_data, max, - blkoff); - if (i >= max) - break; - if (ext4_fc_replay_check_excluded(sb, - ext4_group_first_block_no(sb, group) + - EXT4_C2B(sbi, i))) { - blkoff = i + 1; - } else - break; - } - brelse(bitmap_bh); - if (i < max) - break; - - if (++group >= ext4_get_groups_count(sb)) - group = 0; - - blkoff = 0; - } - - if (i >= max) { - *errp = -ENOSPC; - return 0; - } - - block = ext4_group_first_block_no(sb, group) + EXT4_C2B(sbi, i); - ext4_mb_mark_bb(sb, block, 1, 1); - ar->len = 1; - - return block; -} - static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block, unsigned long count) { -- cgit v1.2.3 From 247c3d214c23dfeeeb892e91a82ac1188bdaec9f Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Sat, 3 Jun 2023 23:03:18 +0800 Subject: ext4: fix wrong unit use in ext4_mb_clear_bb Function ext4_issue_discard need count in cluster. Pass count_clusters instead of count to fix the mismatch. Signed-off-by: Kemeng Shi Cc: stable@kernel.org Reviewed-by: Ojaswin Mujoo Link: https://lore.kernel.org/r/20230603150327.3596033-11-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 1345c4e84cc3..474aebfdc1dd 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -6280,8 +6280,8 @@ do_more: * them with group lock_held */ if (test_opt(sb, DISCARD)) { - err = ext4_issue_discard(sb, block_group, bit, count, - NULL); + err = ext4_issue_discard(sb, block_group, bit, + count_clusters, NULL); if (err && err != -EOPNOTSUPP) ext4_msg(sb, KERN_WARNING, "discard request in" " group:%u block:%d count:%lu failed" -- cgit v1.2.3 From 2ec6d0a5ea72689a79e6f725fd8b443a788ae279 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Sat, 3 Jun 2023 23:03:19 +0800 Subject: ext4: fix wrong unit use in ext4_mb_new_blocks Function ext4_free_blocks_simple needs count in cluster. Function ext4_free_blocks accepts count in block. Convert count to cluster to fix the mismatch. 
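The conversion helper used for this fix rounds up, which matters whenever the block count is not a multiple of the cluster ratio. Paraphrased from fs/ext4/ext4.h, with a worked example in the comment:

/* Translate a number of blocks to a number of clusters, rounding up.
 * Example: with s_cluster_ratio == 16 (s_cluster_bits == 4), freeing
 * 20 blocks touches two clusters, and (20 + 15) >> 4 == 2. */
#define EXT4_NUM_B2C(sbi, blks)	(((blks) + (sbi)->s_cluster_ratio - 1) >> \
				 (sbi)->s_cluster_bits)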
Signed-off-by: Kemeng Shi Cc: stable@kernel.org Reviewed-by: Ojaswin Mujoo Link: https://lore.kernel.org/r/20230603150327.3596033-12-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 474aebfdc1dd..4322eb7559fc 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -6373,7 +6373,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, } if (sbi->s_mount_state & EXT4_FC_REPLAY) { - ext4_free_blocks_simple(inode, block, count); + ext4_free_blocks_simple(inode, block, EXT4_NUM_B2C(sbi, count)); return; } -- cgit v1.2.3 From 569f196f1e7a14472f21734170411f75a3179db0 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Tue, 30 May 2023 18:03:40 +0530 Subject: ext4: mballoc: Remove useless setting of ac_criteria There will be changes coming in future patches which will introduce a new criterion for block allocation. This removes the useless setting of ac_criteria. AFAIU, this might only be used to differentiate whether preallocated blocks were used or whether the regular allocator was called to allocate blocks. Hence this also adds debug prints to identify what type of block allocation was done in ext4_mb_show_ac(). Signed-off-by: Ritesh Harjani (IBM) Signed-off-by: Ojaswin Mujoo Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/1dbae05617519cb6202f1b299c9d1be3e7cda763.1685449706.git.ojaswin@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 4322eb7559fc..8a3946f1bdd9 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4617,7 +4617,6 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) atomic_inc(&tmp_pa->pa_count); ext4_mb_use_inode_pa(ac, tmp_pa); spin_unlock(&tmp_pa->pa_lock); - ac->ac_criteria = 10; read_unlock(&ei->i_prealloc_lock); return true; } @@ -4660,7 +4659,6 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) } if (cpa) { ext4_mb_use_group_pa(ac, cpa); - ac->ac_criteria = 20; return true; } return false; @@ -5434,6 +5432,10 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) (unsigned long)ac->ac_b_ex.fe_logical, (int)ac->ac_criteria); mb_debug(sb, "%u found", ac->ac_found); + mb_debug(sb, "used pa: %s, ", ac->ac_pa ? "yes" : "no"); + if (ac->ac_pa) + mb_debug(sb, "pa_type %s\n", ac->ac_pa->pa_type == MB_GROUP_PA ? + "group pa" : "inode pa"); ext4_mb_show_pa(sb); } #else -- cgit v1.2.3 From 5730cce35344fba94e2c329d2bc0a170333a059f Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Tue, 30 May 2023 18:03:41 +0530 Subject: ext4: Remove unused extern variables declaration ext4_mb_stats & ext4_mb_max_to_scan are never used. We use sbi->s_mb_stats and sbi->s_mb_max_to_scan instead. Hence kill these extern declarations.
Signed-off-by: Ritesh Harjani (IBM) Signed-off-by: Ojaswin Mujoo Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/928b3142062172533b6d1b5a94de94700590fef3.1685449706.git.ojaswin@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 2 -- fs/ext4/mballoc.h | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index e6d80325718d..2a7e254cbae3 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2837,8 +2837,6 @@ int ext4_fc_record_regions(struct super_block *sb, int ino, /* mballoc.c */ extern const struct seq_operations ext4_mb_seq_groups_ops; extern const struct seq_operations ext4_mb_seq_structs_summary_ops; -extern long ext4_mb_stats; -extern long ext4_mb_max_to_scan; extern int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset); extern int ext4_mb_init(struct super_block *); extern int ext4_mb_release(struct super_block *); diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 6d85ee8674a6..24b666e558f1 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -49,7 +49,7 @@ #define MB_DEFAULT_MIN_TO_SCAN 10 /* - * with 'ext4_mb_stats' allocator will collect stats that will be + * with 's_mb_stats' allocator will collect stats that will be * shown at umount. The collecting costs though! */ #define MB_DEFAULT_STATS 0 -- cgit v1.2.3 From 4eb7a4a1a33bdaf259fca8528f2546c90ad18f0d Mon Sep 17 00:00:00 2001 From: Ojaswin Mujoo Date: Tue, 30 May 2023 18:03:42 +0530 Subject: ext4: Convert mballoc cr (criteria) to enum Convert criteria to be an enum so it easier to maintain and update the tracefiles to use enum names. This change also makes it easier to insert new criterias in the future. There is no functional change in this patch. Signed-off-by: Ojaswin Mujoo Reviewed-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/5d82fd467bdf70ea45bdaef810af3b146013946c.1685449706.git.ojaswin@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 23 +++++++++++-- fs/ext4/mballoc.c | 96 +++++++++++++++++++++++++++---------------------------- 2 files changed, 68 insertions(+), 51 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 2a7e254cbae3..914475cbb060 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -127,6 +127,23 @@ enum SHIFT_DIRECTION { SHIFT_RIGHT, }; +/* + * Number of criterias defined. For each criteria, mballoc has slightly + * different way of finding the required blocks nad usually, higher the + * criteria the slower the allocation. We start at lower criterias and keep + * falling back to higher ones if we are not able to find any blocks. + */ +#define EXT4_MB_NUM_CRS 4 +/* + * All possible allocation criterias for mballoc + */ +enum criteria { + CR0, + CR1, + CR2, + CR3, +}; + /* * Flags used in mballoc's allocation_context flags field. 
* @@ -1544,9 +1561,9 @@ struct ext4_sb_info { atomic_t s_bal_2orders; /* 2^order hits */ atomic_t s_bal_cr0_bad_suggestions; atomic_t s_bal_cr1_bad_suggestions; - atomic64_t s_bal_cX_groups_considered[4]; - atomic64_t s_bal_cX_hits[4]; - atomic64_t s_bal_cX_failed[4]; /* cX loop didn't find blocks */ + atomic64_t s_bal_cX_groups_considered[EXT4_MB_NUM_CRS]; + atomic64_t s_bal_cX_hits[EXT4_MB_NUM_CRS]; + atomic64_t s_bal_cX_failed[EXT4_MB_NUM_CRS]; /* cX loop didn't find blocks */ atomic_t s_mb_buddies_generated; /* number of buddies generated */ atomic64_t s_mb_generation_time; atomic_t s_mb_lost_chunks; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 8a3946f1bdd9..4359280d2fa7 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -154,19 +154,19 @@ * structures to decide the order in which groups are to be traversed for * fulfilling an allocation request. * - * At CR = 0, we look for groups which have the largest_free_order >= the order + * At CR0 , we look for groups which have the largest_free_order >= the order * of the request. We directly look at the largest free order list in the data * structure (1) above where largest_free_order = order of the request. If that * list is empty, we look at remaining list in the increasing order of - * largest_free_order. This allows us to perform CR = 0 lookup in O(1) time. + * largest_free_order. This allows us to perform CR0 lookup in O(1) time. * - * At CR = 1, we only consider groups where average fragment size > request + * At CR1, we only consider groups where average fragment size > request * size. So, we lookup a group which has average fragment size just above or * equal to request size using our average fragment size group lists (data * structure 2) in O(1) time. * * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in - * linear order which requires O(N) search time for each CR 0 and CR 1 phase. + * linear order which requires O(N) search time for each CR0 and CR1 phase. * * The regular allocator (using the buddy cache) supports a few tunables. * @@ -409,7 +409,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac); static bool ext4_mb_good_group(struct ext4_allocation_context *ac, - ext4_group_t group, int cr); + ext4_group_t group, enum criteria cr); static int ext4_try_to_trim_range(struct super_block *sb, struct ext4_buddy *e4b, ext4_grpblk_t start, @@ -859,7 +859,7 @@ mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp) * cr level needs an update. 
*/ static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac, - int *new_cr, ext4_group_t *group, ext4_group_t ngroups) + enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_group_info *iter, *grp; @@ -884,8 +884,8 @@ static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac, list_for_each_entry(iter, &sbi->s_mb_largest_free_orders[i], bb_largest_free_order_node) { if (sbi->s_mb_stats) - atomic64_inc(&sbi->s_bal_cX_groups_considered[0]); - if (likely(ext4_mb_good_group(ac, iter->bb_group, 0))) { + atomic64_inc(&sbi->s_bal_cX_groups_considered[CR0]); + if (likely(ext4_mb_good_group(ac, iter->bb_group, CR0))) { grp = iter; break; } @@ -897,7 +897,7 @@ static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac, if (!grp) { /* Increment cr and search again */ - *new_cr = 1; + *new_cr = CR1; } else { *group = grp->bb_group; ac->ac_flags |= EXT4_MB_CR0_OPTIMIZED; @@ -909,7 +909,7 @@ static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac, * order. Updates *new_cr if cr level needs an update. */ static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac, - int *new_cr, ext4_group_t *group, ext4_group_t ngroups) + enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_group_info *grp = NULL, *iter; @@ -932,8 +932,8 @@ static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac, list_for_each_entry(iter, &sbi->s_mb_avg_fragment_size[i], bb_avg_fragment_size_node) { if (sbi->s_mb_stats) - atomic64_inc(&sbi->s_bal_cX_groups_considered[1]); - if (likely(ext4_mb_good_group(ac, iter->bb_group, 1))) { + atomic64_inc(&sbi->s_bal_cX_groups_considered[CR1]); + if (likely(ext4_mb_good_group(ac, iter->bb_group, CR1))) { grp = iter; break; } @@ -947,7 +947,7 @@ static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac, *group = grp->bb_group; ac->ac_flags |= EXT4_MB_CR1_OPTIMIZED; } else { - *new_cr = 2; + *new_cr = CR2; } } @@ -955,7 +955,7 @@ static inline int should_optimize_scan(struct ext4_allocation_context *ac) { if (unlikely(!test_opt2(ac->ac_sb, MB_OPTIMIZE_SCAN))) return 0; - if (ac->ac_criteria >= 2) + if (ac->ac_criteria >= CR2) return 0; if (!ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) return 0; @@ -1000,7 +1000,7 @@ inc_and_return: * @ngroups Total number of groups */ static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac, - int *new_cr, ext4_group_t *group, ext4_group_t ngroups) + enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) { *new_cr = ac->ac_criteria; @@ -1009,9 +1009,9 @@ static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac, return; } - if (*new_cr == 0) { + if (*new_cr == CR0) { ext4_mb_choose_next_group_cr0(ac, new_cr, group, ngroups); - } else if (*new_cr == 1) { + } else if (*new_cr == CR1) { ext4_mb_choose_next_group_cr1(ac, new_cr, group, ngroups); } else { /* @@ -2408,13 +2408,13 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, * for the allocation or not. 
*/ static bool ext4_mb_good_group(struct ext4_allocation_context *ac, - ext4_group_t group, int cr) + ext4_group_t group, enum criteria cr) { ext4_grpblk_t free, fragments; int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); - BUG_ON(cr < 0 || cr >= 4); + BUG_ON(cr < CR0 || cr >= EXT4_MB_NUM_CRS); if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp) || !grp)) return false; @@ -2428,7 +2428,7 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac, return false; switch (cr) { - case 0: + case CR0: BUG_ON(ac->ac_2order == 0); /* Avoid using the first bg of a flexgroup for data files */ @@ -2447,15 +2447,15 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac, return false; return true; - case 1: + case CR1: if ((free / fragments) >= ac->ac_g_ex.fe_len) return true; break; - case 2: + case CR2: if (free >= ac->ac_g_ex.fe_len) return true; break; - case 3: + case CR3: return true; default: BUG(); @@ -2476,7 +2476,7 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac, * out"! */ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, - ext4_group_t group, int cr) + ext4_group_t group, enum criteria cr) { struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); struct super_block *sb = ac->ac_sb; @@ -2496,7 +2496,7 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, free = grp->bb_free; if (free == 0) goto out; - if (cr <= 2 && free < ac->ac_g_ex.fe_len) + if (cr <= CR2 && free < ac->ac_g_ex.fe_len) goto out; if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) goto out; @@ -2511,7 +2511,7 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, ext4_get_group_desc(sb, group, NULL); int ret; - /* cr=0/1 is a very optimistic search to find large + /* cr=CR0/CR1 is a very optimistic search to find large * good chunks almost for free. If buddy data is not * ready, then this optimization makes no sense. But * we never skip the first block group in a flex_bg, @@ -2519,7 +2519,7 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, * and we want to make sure we locate metadata blocks * in the first block group in the flex_bg if possible. */ - if (cr < 2 && + if (cr < CR2 && (!sbi->s_log_groups_per_flex || ((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) && !(ext4_has_group_desc_csum(sb) && @@ -2625,7 +2625,7 @@ static noinline_for_stack int ext4_mb_regular_allocator(struct ext4_allocation_context *ac) { ext4_group_t prefetch_grp = 0, ngroups, group, i; - int cr = -1, new_cr; + enum criteria cr, new_cr; int err = 0, first_err = 0; unsigned int nr = 0, prefetch_ios = 0; struct ext4_sb_info *sbi; @@ -2683,13 +2683,13 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) } /* Let's just scan groups to find more-less suitable blocks */ - cr = ac->ac_2order ? 0 : 1; + cr = ac->ac_2order ? 
CR0 : CR1; /* - * cr == 0 try to get exact allocation, - * cr == 3 try to get anything + * cr == CR0 try to get exact allocation, + * cr == CR3 try to get anything */ repeat: - for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) { + for (; cr < EXT4_MB_NUM_CRS && ac->ac_status == AC_STATUS_CONTINUE; cr++) { ac->ac_criteria = cr; /* * searching for the right group start @@ -2716,7 +2716,7 @@ repeat: * spend a lot of time loading imperfect groups */ if ((prefetch_grp == group) && - (cr > 1 || + (cr > CR1 || prefetch_ios < sbi->s_mb_prefetch_limit)) { unsigned int curr_ios = prefetch_ios; @@ -2758,9 +2758,9 @@ repeat: } ac->ac_groups_scanned++; - if (cr == 0) + if (cr == CR0) ext4_mb_simple_scan_group(ac, &e4b); - else if (cr == 1 && sbi->s_stripe && + else if (cr == CR1 && sbi->s_stripe && !(ac->ac_g_ex.fe_len % EXT4_B2C(sbi, sbi->s_stripe))) ext4_mb_scan_aligned(ac, &e4b); @@ -2801,7 +2801,7 @@ repeat: ac->ac_b_ex.fe_len = 0; ac->ac_status = AC_STATUS_CONTINUE; ac->ac_flags |= EXT4_MB_HINT_FIRST; - cr = 3; + cr = CR3; goto repeat; } } @@ -2926,36 +2926,36 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) seq_printf(seq, "\tgroups_scanned: %u\n", atomic_read(&sbi->s_bal_groups_scanned)); seq_puts(seq, "\tcr0_stats:\n"); - seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[0])); + seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR0])); seq_printf(seq, "\t\tgroups_considered: %llu\n", - atomic64_read(&sbi->s_bal_cX_groups_considered[0])); + atomic64_read(&sbi->s_bal_cX_groups_considered[CR0])); seq_printf(seq, "\t\tuseless_loops: %llu\n", - atomic64_read(&sbi->s_bal_cX_failed[0])); + atomic64_read(&sbi->s_bal_cX_failed[CR0])); seq_printf(seq, "\t\tbad_suggestions: %u\n", atomic_read(&sbi->s_bal_cr0_bad_suggestions)); seq_puts(seq, "\tcr1_stats:\n"); - seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[1])); + seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR1])); seq_printf(seq, "\t\tgroups_considered: %llu\n", - atomic64_read(&sbi->s_bal_cX_groups_considered[1])); + atomic64_read(&sbi->s_bal_cX_groups_considered[CR1])); seq_printf(seq, "\t\tuseless_loops: %llu\n", - atomic64_read(&sbi->s_bal_cX_failed[1])); + atomic64_read(&sbi->s_bal_cX_failed[CR1])); seq_printf(seq, "\t\tbad_suggestions: %u\n", atomic_read(&sbi->s_bal_cr1_bad_suggestions)); seq_puts(seq, "\tcr2_stats:\n"); - seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[2])); + seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR2])); seq_printf(seq, "\t\tgroups_considered: %llu\n", - atomic64_read(&sbi->s_bal_cX_groups_considered[2])); + atomic64_read(&sbi->s_bal_cX_groups_considered[CR2])); seq_printf(seq, "\t\tuseless_loops: %llu\n", - atomic64_read(&sbi->s_bal_cX_failed[2])); + atomic64_read(&sbi->s_bal_cX_failed[CR2])); seq_puts(seq, "\tcr3_stats:\n"); - seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[3])); + seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR3])); seq_printf(seq, "\t\tgroups_considered: %llu\n", - atomic64_read(&sbi->s_bal_cX_groups_considered[3])); + atomic64_read(&sbi->s_bal_cX_groups_considered[CR3])); seq_printf(seq, "\t\tuseless_loops: %llu\n", - atomic64_read(&sbi->s_bal_cX_failed[3])); + atomic64_read(&sbi->s_bal_cX_failed[CR3])); seq_printf(seq, "\textents_scanned: %u\n", atomic_read(&sbi->s_bal_ex_scanned)); seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals)); seq_printf(seq, "\t\t2^n_hits: %u\n", 
atomic_read(&sbi->s_bal_2orders)); -- cgit v1.2.3 From fdd9a00943a5f6687937628e01506dad697c9140 Mon Sep 17 00:00:00 2001 From: Ojaswin Mujoo Date: Tue, 30 May 2023 18:03:43 +0530 Subject: ext4: Add per CR extent scanned counter This gives better visibility into the number of extents scanned in each particular CR. For example, this information can be used to see how out block group scanning logic is performing when the BG is fragmented. Signed-off-by: Ojaswin Mujoo Reviewed-by: Ritesh Harjani (IBM) Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/55bb6d80f6e22ed2a5a830aa045572bdffc8b1b9.1685449706.git.ojaswin@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 1 + fs/ext4/mballoc.c | 12 ++++++++++++ fs/ext4/mballoc.h | 1 + 3 files changed, 14 insertions(+) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 914475cbb060..34ec65126ea7 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1555,6 +1555,7 @@ struct ext4_sb_info { atomic_t s_bal_success; /* we found long enough chunks */ atomic_t s_bal_allocated; /* in blocks */ atomic_t s_bal_ex_scanned; /* total extents scanned */ + atomic_t s_bal_cX_ex_scanned[EXT4_MB_NUM_CRS]; /* total extents scanned */ atomic_t s_bal_groups_scanned; /* number of groups scanned */ atomic_t s_bal_goals; /* goal hits */ atomic_t s_bal_breaks; /* too long searches */ diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 4359280d2fa7..e28731426ee5 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2103,6 +2103,7 @@ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac, BUG_ON(ac->ac_status != AC_STATUS_CONTINUE); ac->ac_found++; + ac->ac_cX_found[ac->ac_criteria]++; /* * The special case - take what you catch first @@ -2277,6 +2278,7 @@ void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, break; } ac->ac_found++; + ac->ac_cX_found[ac->ac_criteria]++; ac->ac_b_ex.fe_len = 1 << i; ac->ac_b_ex.fe_start = k << i; @@ -2392,6 +2394,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, max = mb_find_extent(e4b, i, stripe, &ex); if (max >= stripe) { ac->ac_found++; + ac->ac_cX_found[ac->ac_criteria]++; ex.fe_logical = 0xDEADF00D; /* debug value */ ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); @@ -2929,6 +2932,7 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR0])); seq_printf(seq, "\t\tgroups_considered: %llu\n", atomic64_read(&sbi->s_bal_cX_groups_considered[CR0])); + seq_printf(seq, "\t\textents_scanned: %u\n", atomic_read(&sbi->s_bal_cX_ex_scanned[CR0])); seq_printf(seq, "\t\tuseless_loops: %llu\n", atomic64_read(&sbi->s_bal_cX_failed[CR0])); seq_printf(seq, "\t\tbad_suggestions: %u\n", @@ -2938,6 +2942,7 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR1])); seq_printf(seq, "\t\tgroups_considered: %llu\n", atomic64_read(&sbi->s_bal_cX_groups_considered[CR1])); + seq_printf(seq, "\t\textents_scanned: %u\n", atomic_read(&sbi->s_bal_cX_ex_scanned[CR1])); seq_printf(seq, "\t\tuseless_loops: %llu\n", atomic64_read(&sbi->s_bal_cX_failed[CR1])); seq_printf(seq, "\t\tbad_suggestions: %u\n", @@ -2947,6 +2952,7 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR2])); seq_printf(seq, "\t\tgroups_considered: %llu\n", atomic64_read(&sbi->s_bal_cX_groups_considered[CR2])); + seq_printf(seq, "\t\textents_scanned: %u\n", 
atomic_read(&sbi->s_bal_cX_ex_scanned[CR2])); seq_printf(seq, "\t\tuseless_loops: %llu\n", atomic64_read(&sbi->s_bal_cX_failed[CR2])); @@ -2954,6 +2960,7 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR3])); seq_printf(seq, "\t\tgroups_considered: %llu\n", atomic64_read(&sbi->s_bal_cX_groups_considered[CR3])); + seq_printf(seq, "\t\textents_scanned: %u\n", atomic_read(&sbi->s_bal_cX_ex_scanned[CR3])); seq_printf(seq, "\t\tuseless_loops: %llu\n", atomic64_read(&sbi->s_bal_cX_failed[CR3])); seq_printf(seq, "\textents_scanned: %u\n", atomic_read(&sbi->s_bal_ex_scanned)); @@ -4393,7 +4400,12 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated); if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len) atomic_inc(&sbi->s_bal_success); + atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned); + for (int i=0; i<EXT4_MB_NUM_CRS; i++) { + atomic_add(ac->ac_cX_found[i], &sbi->s_bal_cX_ex_scanned[i]); + } + atomic_add(ac->ac_groups_scanned, &sbi->s_bal_groups_scanned); if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 24b666e558f1..acfdc204e15d 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -184,6 +184,7 @@ struct ext4_allocation_context { __u16 ac_groups_scanned; __u16 ac_groups_linear_remaining; __u16 ac_found; + __u16 ac_cX_found[EXT4_MB_NUM_CRS]; __u16 ac_tail; __u16 ac_buddy; __u8 ac_status; -- cgit v1.2.3 From 3ef5d263879696027c70548532a94418aad3bd95 Mon Sep 17 00:00:00 2001 From: Ojaswin Mujoo Date: Tue, 30 May 2023 18:03:44 +0530 Subject: ext4: Add counter to track successful allocation of goal length Track the number of allocations where the length of blocks allocated is equal to the length of goal blocks (post normalization). This metric could be useful when making changes to the allocator logic in the future, as it gives us visibility into how often we trim our requests. PS: ac_b_ex.fe_len might get modified due to preallocation efforts, hence we use ac_f_ex.fe_len instead, since we want to compare how much the allocator was actually able to find.
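Taken together with the existing goal-hit counter, ext4_mb_collect_stats() distinguishes two notions of a "goal hit" after the diff below. A condensed view of the two checks, with ac_f_ex used for the length check because, per the note above, preallocation may already have trimmed ac_b_ex.fe_len:

/* Position hit: the best extent starts exactly at the goal block. */
if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
    ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
	atomic_inc(&sbi->s_bal_goals);
/* Length hit: the allocator found the full normalized goal length. */
if (ac->ac_f_ex.fe_len == ac->ac_g_ex.fe_len)
	atomic_inc(&sbi->s_bal_len_goals);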
Signed-off-by: Ojaswin Mujoo Reviewed-by: Ritesh Harjani (IBM) Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/343620e2be8a237239ea2613a7a866ee8607e973.1685449706.git.ojaswin@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 1 + fs/ext4/mballoc.c | 3 +++ 2 files changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 34ec65126ea7..e4f53947f51f 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1558,6 +1558,7 @@ struct ext4_sb_info { atomic_t s_bal_cX_ex_scanned[EXT4_MB_NUM_CRS]; /* total extents scanned */ atomic_t s_bal_groups_scanned; /* number of groups scanned */ atomic_t s_bal_goals; /* goal hits */ + atomic_t s_bal_len_goals; /* len goal hits */ atomic_t s_bal_breaks; /* too long searches */ atomic_t s_bal_2orders; /* 2^order hits */ atomic_t s_bal_cr0_bad_suggestions; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index e28731426ee5..64d3760e4bcf 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2965,6 +2965,7 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) atomic64_read(&sbi->s_bal_cX_failed[CR3])); seq_printf(seq, "\textents_scanned: %u\n", atomic_read(&sbi->s_bal_ex_scanned)); seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals)); + seq_printf(seq, "\t\tlen_goal_hits: %u\n", atomic_read(&sbi->s_bal_len_goals)); seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders)); seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks)); seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks)); @@ -4410,6 +4411,8 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) atomic_inc(&sbi->s_bal_goals); + if (ac->ac_f_ex.fe_len == ac->ac_g_ex.fe_len) + atomic_inc(&sbi->s_bal_len_goals); if (ac->ac_found > sbi->s_mb_max_to_scan) atomic_inc(&sbi->s_bal_breaks); } -- cgit v1.2.3 From 1b420011210802a7a1b1e99f30bc1d62c352ac71 Mon Sep 17 00:00:00 2001 From: Ojaswin Mujoo Date: Tue, 30 May 2023 18:03:45 +0530 Subject: ext4: Avoid scanning smaller extents in BG during CR1 When we are inside ext4_mb_complex_scan_group() in CR1, we can be sure that this group has at least one big enough contiguous free extent to satisfy our request because (free / fragments) > goal length. Hence, instead of wasting time looping over smaller free e