// SPDX-License-Identifier: GPL-2.0-only
/*
* kexec_handover.c - kexec handover metadata processing
* Copyright (C) 2023 Alexander Graf <graf@amazon.com>
* Copyright (C) 2025 Microsoft Corporation, Mike Rapoport <rppt@kernel.org>
* Copyright (C) 2025 Google LLC, Changyuan Lyu <changyuanl@google.com>
*/
#define pr_fmt(fmt) "KHO: " fmt
#include <linux/cma.h>
#include <linux/count_zeros.h>
#include <linux/debugfs.h>
#include <linux/kexec.h>
#include <linux/kexec_handover.h>
#include <linux/libfdt.h>
#include <linux/list.h>
#include <linux/memblock.h>
#include <linux/notifier.h>
#include <linux/page-isolation.h>
#include <linux/vmalloc.h>
#include <asm/early_ioremap.h>
/*
* KHO is tightly coupled with mm init and needs access to some of mm
* internal APIs.
*/
#include "../mm/internal.h"
#include "kexec_internal.h"
#define KHO_FDT_COMPATIBLE "kho-v1"
#define PROP_PRESERVED_MEMORY_MAP "preserved-memory-map"
#define PROP_SUB_FDT "fdt"
#define KHO_PAGE_MAGIC 0x4b484f50U /* ASCII for 'KHOP' */
/*
* KHO uses page->private, which is an unsigned long, to store page metadata.
* Use it to store both the magic and the order.
*/
union kho_page_info {
	/* Raw value as stored in page->private. */
	unsigned long page_private;
	struct {
		/* Folio order of the preserved allocation. */
		unsigned int order;
		/* KHO_PAGE_MAGIC when this page carries KHO metadata. */
		unsigned int magic;
	};
};
/* The overlay must cover page->private exactly — no truncation, no spill. */
static_assert(sizeof(union kho_page_info) == sizeof(((struct page *)0)->private));
/* Set once from the "kho" early parameter; read-only after init. */
static bool kho_enable __ro_after_init;

/**
 * kho_is_enabled - report whether kexec handover is enabled
 *
 * Return: true if the "kho" boot parameter turned handover on.
 */
bool kho_is_enabled(void)
{
	return kho_enable;
}
EXPORT_SYMBOL_GPL(kho_is_enabled);
/* Parse the "kho=" boot parameter (any kstrtobool form) into kho_enable. */
static int __init kho_parse_enable(char *p)
{
	return kstrtobool(p, &kho_enable);
}
early_param("kho", kho_parse_enable);
/*
* Keep track of memory that is to be preserved across KHO.
*
* The serializing side uses two levels of xarrays to manage chunks of per-order
* 512 byte bitmaps. For instance if PAGE_SIZE = 4096, the entire 1G order of a
* 1TB system would fit inside a single 512 byte bitmap. For order 0 allocations
* each bitmap will cover 16M of address space. Thus, for 16G of memory at most
* 512K of bitmap memory will be needed for order 0.
*
* This approach is fully incremental: as the serialization progresses, folios
* can continue to be aggregated to the tracker. The final step, immediately prior
* to kexec would serialize the xarray information into a linked list for the
* successor kernel to parse.
*/
/* Bits per bitmap chunk: one 512-byte kho_mem_phys_bits covers 4096 entries. */
#define PRESERVE_BITS (512 * 8)

struct kho_mem_phys_bits {
	/* One bit per (pfn >> order) within this chunk's index range. */
	DECLARE_BITMAP(preserve, PRESERVE_BITS);
};

struct kho_mem_phys {
	/*
	 * Points to kho_mem_phys_bits, a sparse bitmap array. Each bit is sized
	 * to order.
	 */
	struct xarray phys_bits;
};

struct kho_mem_track {
	/* Points to kho_mem_phys, each order gets its own bitmap tree */
	struct xarray orders;
};
struct khoser_mem_chunk;

/* State accumulated while building the handover image for the next kernel. */
struct kho_serialization {
	/* Root KHO FDT page handed to the successor kernel. */
	struct page *fdt;
	/* NOTE(review): presumably the list of registered sub-FDTs — confirm. */
	struct list_head fdt_list;
	/* Directory entry under which sub-FDTs are exposed. */
	struct dentry *sub_fdt_dir;
	/* Per-order bitmap tracker of preserved memory (see kho_mem_track). */
	struct kho_mem_track track;
	/* First chunk of serialized preserved memory map */
	struct khoser_mem_chunk *preserved_mem_map;
};
/* Outgoing-kernel side of KHO: notifier chain plus serialization state. */
struct kho_out {
	/* Notifier chain invoked around KHO state transitions. */
	struct blocking_notifier_head chain_head;
	/* Top-level directory entry for KHO. */
	struct dentry *dir;
	struct mutex lock; /* protects KHO FDT finalization */
	/* Serialization state built up before kexec. */
	struct kho_serialization ser;
	/* True once the KHO FDT is finalized (presumably under @lock). */
	bool finalized;
};
/* Singleton output state; statically initialized so it is valid before init. */
static struct kho_out kho_out = {
	.chain_head = BLOCKING_NOTIFIER_INIT(kho_out.chain_head),
	.lock = __MUTEX_INITIALIZER(kho_out.lock),
	.ser = {
		.fdt_list = LIST_HEAD_INIT(kho_out.ser.fdt_list),
		.track = {
			/* Plain xarray: indices are dense small ints (orders). */
			.orders = XARRAY_INIT(kho_out.ser.track.orders, 0),
		},
	},
	.finalized = false,
};
/*
 * Return the entry at @index in @xa, allocating a zeroed @sz-byte element if
 * none exists yet. Safe against concurrent insertion: if another thread wins
 * the race, the freshly allocated element is freed and the winner's entry is
 * returned. Returns an ERR_PTR() on allocation or xarray failure.
 */
static void *xa_load_or_alloc(struct xarray *xa, unsigned long index, size_t sz)
{
	void *existing;
	void *fresh;

	existing = xa_load(xa, index);
	if (existing)
		return existing;

	fresh = kzalloc(sz, GFP_KERNEL);
	if (!fresh)
		return ERR_PTR(-ENOMEM);

	/* Only install @fresh if the slot is still empty. */
	existing = xa_cmpxchg(xa, index, NULL, fresh, GFP_KERNEL);
	if (xa_is_err(existing)) {
		kfree(fresh);
		return ERR_PTR(xa_err(existing));
	}
	if (existing) {
		/* Lost the race — keep the entry that got there first. */
		kfree(fresh);
		return existing;
	}

	return fresh;
}
/*
 * Clear the preservation bits for the pfn range [@pfn, @end_pfn) in @track.
 *
 * The range is walked in maximally-sized naturally-aligned chunks: each step
 * uses the largest order allowed by both the alignment of @pfn and the
 * remaining length, mirroring how __kho_preserve_order() records ranges.
 *
 * Fix vs. previous version: when no tracking structure exists for a chunk
 * (physxa or bits lookup returns NULL) the old code did `continue` without
 * advancing @pfn, spinning forever. There is nothing to clear for such a
 * chunk, so simply skip past it. Also use 1UL for the step so the shift is
 * well-defined for order >= 31 on 64-bit pfn ranges.
 */
static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn,
			     unsigned long end_pfn)
{
	struct kho_mem_phys_bits *bits;
	struct kho_mem_phys *physxa;

	while (pfn < end_pfn) {
		const unsigned int order =
			min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
		const unsigned long pfn_high = pfn >> order;

		physxa = xa_load(&track->orders, order);
		if (!physxa)
			goto next;	/* nothing tracked at this order */

		bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS);
		if (!bits)
			goto next;	/* no bitmap chunk for this range */

		clear_bit(pfn_high % PRESERVE_BITS, bits->preserve);
next:
		/* Always advance, or a missing xarray entry loops forever. */
		pfn += 1UL << order;
	}
}
static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn,
unsigned int order)
{
struct kho_mem_phys_bits *
|