// SPDX-License-Identifier: GPL-2.0+
/*
* Copyright (C) 2016 Oracle. All Rights Reserved.
* Author: Darrick J. Wong <darrick.wong@oracle.com>
*/
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_btree.h"
#include "xfs_refcount_btree.h"
#include "xfs_refcount.h"
#include "xfs_bmap_btree.h"
#include "xfs_trans_space.h"
#include "xfs_bit.h"
#include "xfs_alloc.h"
#include "xfs_quota.h"
#include "xfs_reflink.h"
#include "xfs_iomap.h"
#include "xfs_sb.h"
#include "xfs_ag_resv.h"
/*
* Copy on Write of Shared Blocks
*
* XFS must preserve "the usual" file semantics even when two files share
* the same physical blocks. This means that a write to one file must not
* alter the blocks in a different file; the way that we'll do that is
* through the use of a copy-on-write mechanism. At a high level, that
* means that when we want to write to a shared block, we allocate a new
* block, write the data to the new block, and if that succeeds we map the
* new block into the file.
*
* XFS provides a "delayed allocation" mechanism that defers the allocation
* of disk blocks to dirty-but-not-yet-mapped file blocks as long as
* possible. This reduces fragmentation by enabling the filesystem to ask
* for bigger chunks less often, which is exactly what we want for CoW.
*
* The delalloc mechanism begins when the kernel wants to make a block
* writable (write_begin or page_mkwrite). If the offset is not mapped, we
* create a delalloc mapping, which is a regular in-core extent, but without
* a real startblock. (For delalloc mappings, the startblock encodes both
* a flag that this is a delalloc mapping, and a worst-case estimate of how
* many blocks might be required to put the mapping into the BMBT.) Delalloc
* mappings are a reservation against the free space in the filesystem;
* adjacent mappings can also be combined into fewer larger mappings.
*
* As an optimization, the CoW extent size hint (cowextsz) creates
* outsized aligned delalloc reservations in the hope of landing nearby
* out-of-order CoW writes in a single extent on disk, thereby reducing
* fragmentation and improving future performance.
*
* D: --RRRRRRSSSRRRRRRRR--- (data fork)
* C: ------DDDDDDD--------- (CoW fork)
*
* When dirty pages are being written out (typically in writepage), the
* delalloc reservations are converted into unwritten mappings by
* allocating blocks and replacing the delalloc mappings with real ones.
* A delalloc mapping can be replaced by several unwritten ones if the
* free space is fragmented.
*
* D: --RRRRRRSSSRRRRRRRR---
* C: ------UUUUUUU---------
*
* We want to adapt the delalloc mechanism for copy-on-write, since the
* write paths are similar. The first two steps (creating the reservation
* and allocating the blocks) are exactly the same as delalloc except that
* the mappings must be stored in a separate CoW fork because we do not want
* to disturb the mapping in the data fork until we're sure that the write
* succeeded. IO completion in this case is the process of removing the old
* mapping from the data fork and moving the new mapping from the CoW fork to
* the data fork. This will be discussed shortly.
*
* For now, unaligned directio writes will be bounced back to the page cache.
* Block-aligned directio writes will use the same mechanism as buf
|