// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2023-2025 Christoph Hellwig.
* Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
*/
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_trans.h"
#include "xfs_icache.h"
#include "xfs_rmap.h"
#include "xfs_rtbitmap.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_zone_alloc.h"
#include "xfs_zone_priv.h"
#include "xfs_zones.h"
#include "xfs_trace.h"
/*
 * Implement Garbage Collection (GC) of partially used zones.
*
 * To support the purely sequential writes in each zone, zoned XFS needs to be
 * able to move the data still remaining in a zone out of it, so that the zone
 * can be reset and prepared for writing again.
*
 * This is done by the GC thread implemented in this file. To support that, a
* number of zones (XFS_GC_ZONES) is reserved from the user visible capacity to
* write the garbage collected data into.
*
* Whenever the available space is below the chosen threshold, the GC thread
* looks for potential non-empty but not fully used zones that are worth
 * reclaiming. Once found, the rmap for the victim zone is queried, and after
* a bit of sorting to reduce fragmentation, the still live extents are read
* into memory and written to the GC target zone, and the bmap btree of the
* files is updated to point to the new location. To avoid taking the IOLOCK
* and MMAPLOCK for the entire GC process and thus affecting the latency of
* user reads and writes to the files, the GC writes are speculative and the
* I/O completion checks that no other writes happened for the affected regions
* before remapping.
*
* Once a zone does not contain any valid data, be that through GC or user
 * block removal, it is queued for a zone reset. The reset operation
* carefully ensures that the RT device cache is flushed and all transactions
* referencing the rmap have been committed to disk.
*/
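
/*
 * Illustrative sketch of the GC loop described above.  The helper steps are
 * hypothetical placeholders, not the functions actually defined further down
 * in this file; this only restates the sequence of operations:
 *
 *	while (!kthread_should_stop()) {
 *		if (xfs_zoned_need_gc(mp)) {
 *			select a victim zone and load its rmap records;
 *			for each still-live extent in the victim:
 *				read it into a scratch folio;
 *				write it to the open GC zone;
 *				// the write completion remaps the file blocks,
 *				// unless a concurrent write invalidated them
 *		}
 *		reset zones that no longer contain valid data;
 *		wait until woken again;
 *	}
 */
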
/*
* Size of each GC scratch pad. This is also the upper bound for each
* GC I/O, which helps to keep latency down.
*/
#define XFS_GC_CHUNK_SIZE	SZ_1M

/*
* Scratchpad data to read GCed data into.
*
* The offset member tracks where the next allocation starts, and freed tracks
* the amount of space that is not used anymore.
*/
#define XFS_ZONE_GC_NR_SCRATCH 2
struct xfs_zone_scratch {
struct folio *folio;
unsigned int offset;
unsigned int freed;
};
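
/*
 * Illustrative example of the offset/freed bookkeeping above (assumed
 * behaviour, not a literal trace): reading two chunks of 256k and 128k
 * back to back advances offset to 384k.  Once both chunks have been written
 * out and remapped, freed also reaches 384k, at which point the whole folio
 * can be reused for new reads.
 */
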
/*
* Chunk that is read and written for each GC operation.
*
* Note that for writes to actual zoned devices, the chunk can be split when
* reaching the hardware limit.
*/
struct xfs_gc_bio {
struct xfs_zone_gc_data *data;
/*
* Entry into the reading/writing/resetting list. Only accessed from
* the GC thread, so no locking needed.
*/
struct list_head entry;
/*
* State of this gc_bio. Done means the current I/O completed.
* Set from the bio end I/O handler, read from the GC thread.
*/
enum {
XFS_GC_BIO_NEW,
XFS_GC_BIO_DONE,
} state;
/*
* Pointer to the inode and byte range in the inode that this
* GC chunk is operating on.
*/
struct xfs_inode *ip;
loff_t offset;
unsigned int len;
/*
* Existing startblock (in the zone to be freed) and newly assigned
* daddr in the zone GCed into.
*/
xfs_fsblock_t old_startblock;
xfs_daddr_t new_daddr;
struct xfs_zone_scratch *scratch;
/* Are we writing to a sequential write required zone? */
bool is_seq;
/* Open Zone being written to */
struct xfs_open_zone *oz;
/* Bio used for reads and writes, including the bvec used by it */
struct bio_vec bv;
struct bio bio; /* must be last */
};
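
/*
 * Sketch of the intended gc_bio lifecycle, pieced together from the state
 * and list members above (illustrative, not authoritative):
 *
 *	allocate from the GC bio_set, state = XFS_GC_BIO_NEW
 *	submit the read bio and queue the chunk on the reading list
 *	read completion sets state = XFS_GC_BIO_DONE
 *	the GC thread submits the write to the open zone and moves the chunk
 *	to the writing list
 *	write completion sets state = XFS_GC_BIO_DONE again
 *	the GC thread remaps the file range to new_daddr, unless the range
 *	was overwritten in the meantime, and frees the chunk
 */
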
#define XFS_ZONE_GC_RECS	1024

/* iterator, needs to be reinitialized for each victim zone */
struct xfs_zone_gc_iter {
struct xfs_rtgroup *victim_rtg;
unsigned int rec_count;
unsigned int rec_idx;
xfs_agblock_t next_startblock;
struct xfs_rmap_irec *recs;
};

/*
* Per-mount GC state.
*/
struct xfs_zone_gc_data {
struct xfs_mount *mp;
/* bioset used to allocate the gc_bios */
struct bio_set bio_set;
/*
	 * Scratchpads used, and an index to indicate which one is currently in use.
*/
struct xfs_zone_scratch scratch[XFS_ZONE_GC_NR_SCRATCH];
unsigned int scratch_idx;
/*
* List of bios currently being read, written and reset.
* These lists are only accessed by the GC thread itself, and must only
* be processed in order.
*/
struct list_head reading;
struct list_head writing;
struct list_head resetting;
/*
* Iterator for the victim zone.
*/
struct xfs_zone_gc_iter iter;
};
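
/*
 * One way to picture this state (an interpretation of the fields above, not
 * a statement of the exact policy): the two scratch folios double-buffer the
 * data path, so new reads can fill one folio while chunks sourced from the
 * other are still being written out, and the reading/writing/resetting lists
 * form the stages of that pipeline.
 */
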
/*
* We aim to keep enough zones free in stock to fully use the open zone limit
* for data placement purposes. Additionally, the m_zonegc_low_space tunable
* can be set to make sure a fraction of the unused blocks are available for
* writing.
*/
bool
xfs_zoned_need_gc(
struct xfs_mount *mp)
{
	s64			available, free, threshold;
	s32			remainder;

	if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE))
		return false;

	available = xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE);
	if (available <
	    mp->m_groups[XG_TYPE_RTG].blocks *
	    (mp->m_max_open_zones - XFS_OPEN_GC_ZONES))
		return true;

	free = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS);
	threshold = div_s64_rem(free, 100, &remainder);
	threshold = threshold * mp->m_zonegc_low_space +
		    remainder * div_s64(mp->m_zonegc_low_space, 100);

	if (available < threshold)
		return true;
	return false;
}
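
/*
 * Worked example for the threshold arithmetic above (numbers picked purely
 * for illustration): with free = 1050 blocks and m_zonegc_low_space = 10,
 * div_s64_rem() returns 10 with a remainder of 50, so the threshold becomes
 * 10 * 10 + 50 * (10 / 100) = 100 + 0 = 100 blocks, i.e. roughly 10% of the
 * free blocks, presumably split this way to avoid overflowing the 64-bit
 * multiplication for very large values of free.
 */
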
static struct xfs_zone_gc_data *
xfs_zone_gc_data_alloc(
struct xfs_mount *mp)
{
	struct xfs_zone_gc_data	*data;
	int			i;

	data = kzalloc(sizeof(*data), GFP_KERNEL);
	if (!data)
		return NULL;