// SPDX-License-Identifier: GPL-2.0+
/*
* Copyright (C) 2016 Oracle. All Rights Reserved.
* Author: Darrick J. Wong <darrick.wong@oracle.com>
*/
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_btree.h"
#include "xfs_refcount_btree.h"
#include "xfs_refcount.h"
#include "xfs_bmap_btree.h"
#include "xfs_trans_space.h"
#include "xfs_bit.h"
#include "xfs_alloc.h"
#include "xfs_quota.h"
#include "xfs_reflink.h"
#include "xfs_iomap.h"
#include "xfs_sb.h"
#include "xfs_ag_resv.h"
/*
* Copy on Write of Shared Blocks
*
* XFS must preserve "the usual" file semantics even when two files share
* the same physical blocks. This means that a write to one file must not
* alter the blocks in a different file; the way that we'll do that is
* through the use of a copy-on-write mechanism. At a high level, that
* means that when we want to write to a shared block, we allocate a new
* block, write the data to the new block, and if that succeeds we map the
* new block into the file.
*
* XFS provides a "delayed allocation" mechanism that defers the allocation
* of disk blocks to dirty-but-not-yet-mapped file blocks as long as
* possible. This reduces fragmentation by enabling the filesystem to ask
* for bigger chunks less often, which is exactly what we want for CoW.
*
* The delalloc mechanism begins when the kernel wants to make a block
* writable (write_begin or page_mkwrite). If the offset is not mapped, we
* create a delalloc mapping, which is a regular in-core extent, but without
* a real startblock. (For delalloc mappings, the startblock encodes both
* a flag that this is a delalloc mapping, and a worst-case estimate of how
* many blocks might be required to put the mapping into the BMBT.) delalloc
* mappings are a reservation against the free space in the filesystem;
* adjacent mappings can also be combined into fewer larger mappings.
*
* As an optimization, the CoW extent size hint (cowextsz) creates
* outsized aligned delalloc reservations in the hope of landing nearby
* out-of-order CoW writes in a single extent on disk, thereby reducing
* fragmentation and improving future performance.
*
* D: --RRRRRRSSSRRRRRRRR--- (data fork)
* C: ------DDDDDDD--------- (CoW fork)
*
* When dirty pages are being written out (typically in writepage), the
* delalloc reservations are converted into unwritten mappings by
* allocating blocks and replacing the delalloc mapping with real ones.
* A delalloc mapping can be replaced by several unwritten ones if the
* free space is fragmented.
*
* D: --RRRRRRSSSRRRRRRRR---
* C: ------UUUUUUU---------
*
* We want to adapt the delalloc mechanism for copy-on-write, since the
* write paths are similar. The first two steps (creating the reservation
* and allocating the blocks) are exactly the same as delalloc except that
* the mappings must be stored in a separate CoW fork because we do not want
* to disturb the mapping in the data fork until we're sure that the write
* succeeded. IO completion in this case is the process of removing the old
* mapping from the data fork and moving the new mapping from the CoW fork to
* the data fork. This will be discussed shortly.
*
* For now, unaligned directio writes will be bounced back to the page cache.
* Block-aligned directio writes will use the same mechanism as buffered
* writes.
*
* Just prior to submitting the actual disk write requests, we convert
* the extents representing the range of the file actually being written
* (as opposed to extra pieces created for the cowextsize hint) to real
* extents. This will become important in the next step:
*
* D: --RRRRRRSSSRRRRRRRR---
* C: ------UUrrUUU---------
*
* CoW remapping must be done after the data block write completes,
* because we don't want to destroy the old data fork map until we're sure
* the new block has been written. Since the new mappings are kept in a
* separate fork, we can simply iterate these mappings to find the ones
* that cover the file blocks that we just CoW'd. For each extent, simply
* unmap the corresponding range in the data fork, map the new range into
* the data fork, and remove the extent from the CoW fork. Because of
* the presence of the cowextsize hint, however, we must be careful
* only to remap the blocks that we've actually written out -- we must
* never remap delalloc reservations nor CoW staging blocks that have
* yet to be written. This corresponds exactly to the real extents in
* the CoW fork:
*
* D: --RRRRRRrrSRRRRRRRR---
* C: ------UU--UUU---------
*
* Since the remapping operation can be applied to an arbitrary file
* range, we record the need for the remap step as a flag in the ioend
* instead of declaring a new IO type. This is required for direct io
* because we only have one ioend for the whole dio, and we have to be
* able to remember the presence of unwritten blocks and CoW blocks with
* a single ioend structure. Better yet, the more ground we can cover
* with one ioend, the better.
*/
/*
 * Look up shared blocks inside an AG extent.
 *
 * The extent is described by @agno, @agbno and @aglen.  The lowest-numbered
 * run of shared blocks within that range is reported through @fbno and
 * @flen.  When @find_end_of_shared is true, the longest contiguous extent
 * of shared blocks is returned.  If the range contains no shared extents,
 * @fbno and @flen are set to NULLAGBLOCK and 0, respectively.
 *
 * The AGF buffer for @agno is read first and used to set up a refcount
 * btree cursor; the lookup itself is done by xfs_refcount_find_shared().
 * The cursor is torn down and the AGF buffer released before returning.
 *
 * Returns the error from reading the AGF if that fails, otherwise the
 * result of xfs_refcount_find_shared().
 */
int
xfs_reflink_find_shared(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_agnumber_t		agno,
	xfs_agblock_t		agbno,
	xfs_extlen_t		aglen,
	xfs_agblock_t		*fbno,
	xfs_extlen_t		*flen,
	bool			find_end_of_shared)
{
	struct xfs_btree_cur	*rc_cur;
	struct xfs_buf		*agf_bp;
	int			err;

	err = xfs_alloc_read_agf(mp, tp, agno, 0, &agf_bp);
	if (!err) {
		rc_cur = xfs_refcountbt_init_cursor(mp, tp, agf_bp, agno);

		err = xfs_refcount_find_shared(rc_cur, agbno, aglen,
				fbno, flen, find_end_of_shared);

		/* Hand the lookup status to the cursor teardown. */
		xfs_btree_del_cursor(rc_cur, err);

		/* Drop the AGF buffer only after the cursor is gone. */
		xfs_trans_brelse(tp, agf_bp);
	}

	return err;
}
/*
* Trim the mapping to the next block where there's a change in the
* shared/unshared status. More specifically, this means that we
* find the lowest-numbered extent of shared blocks that coincides with
* the given block mapping. If the shared extent overlaps the start of
* the mapping, trim the mapping to the end of the shared extent. If
* the shared region intersects the mapping, trim the mapping to the
* start of the shared extent. If there are no shared regions that
* overlap, just return the original extent.
*/
int
xfs_reflink_trim_around_shared(
struct xfs_inode *ip,
struct xfs_bmbt_irec *irec,
bool *shared)
{
xfs_agnumber_t agno;
xfs_agblock_t agbno;
xfs_extlen_t aglen;
xfs_agblock_t fbno;
xfs_extlen_t flen;
int error = 0;
/* Holes, unwritten, and delalloc extents cannot be shared */
if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_written_extent(irec)) {
*shared = false;
return 0;
}
trace_xfs_reflink_trim_around_shared(ip, irec);
agno = XFS_FSB_TO_AGNO(ip->i_mount, irec->br_startblock);
agbno = XFS_FSB_TO_AGBNO(ip->i_mount, irec->br_startblock);
aglen = irec->br_blockcount;
error = xfs_reflink_find_shared(ip->i_mount, NULL, agno, agbno,
aglen, &fbno, &flen, true);
if (error)
return error;
*shared = false;
if (fbno == NULLAGBLOCK) {
/* No shared blocks at all. */
return 0;
} else if (fbno == agbno) {
/*
* The start of this extent is shared. Truncate the
* mapping at the end of the shared region s
|