summaryrefslogtreecommitdiff
path: root/drivers/hv
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2025-03-25 14:47:04 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2025-03-25 14:47:04 -0700
commita5b3d8660b049779880c790549ff3fef02f6922c (patch)
treedf07a0fd239a926a8713d22325497ac46bebd745 /drivers/hv
parentdce3ab4c57e662ae019c22e7c2f2aa887617beae (diff)
parent628cc040b3a2980df6032766e8ef0688e981ab95 (diff)
downloadlinux-a5b3d8660b049779880c790549ff3fef02f6922c.tar.gz
linux-a5b3d8660b049779880c790549ff3fef02f6922c.tar.bz2
linux-a5b3d8660b049779880c790549ff3fef02f6922c.zip
Merge tag 'hyperv-next-signed-20250324' of git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux
Pull hyperv updates from Wei Liu: - Add support for running as the root partition in Hyper-V (Microsoft Hypervisor) by exposing /dev/mshv (Nuno and various people) - Add support for CPU offlining in Hyper-V (Hamza Mahfooz) - Misc fixes and cleanups (Roman Kisel, Tianyu Lan, Wei Liu, Michael Kelley, Thorsten Blum) * tag 'hyperv-next-signed-20250324' of git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux: (24 commits) x86/hyperv: fix an indentation issue in mshyperv.h x86/hyperv: Add comments about hv_vpset and var size hypercall input args Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs hyperv: Add definitions for root partition driver to hv headers x86: hyperv: Add mshv_handler() irq handler and setup function Drivers: hv: Introduce per-cpu event ring tail Drivers: hv: Export some functions for use by root partition module acpi: numa: Export node_to_pxm() hyperv: Introduce hv_recommend_using_aeoi() arm64/hyperv: Add some missing functions to arm64 x86/mshyperv: Add support for extended Hyper-V features hyperv: Log hypercall status codes as strings x86/hyperv: Fix check of return value from snp_set_vmsa() x86/hyperv: Add VTL mode callback for restarting the system x86/hyperv: Add VTL mode emergency restart callback hyperv: Remove unused union and structs hyperv: Add CONFIG_MSHV_ROOT to gate root partition support hyperv: Change hv_root_partition into a function hyperv: Convert hypercall statuses to linux error codes drivers/hv: add CPU offlining support ...
Diffstat (limited to 'drivers/hv')
-rw-r--r--drivers/hv/Kconfig17
-rw-r--r--drivers/hv/Makefile4
-rw-r--r--drivers/hv/hv.c94
-rw-r--r--drivers/hv/hv_common.c198
-rw-r--r--drivers/hv/hv_proc.c195
-rw-r--r--drivers/hv/mshv.h30
-rw-r--r--drivers/hv/mshv_common.c161
-rw-r--r--drivers/hv/mshv_eventfd.c833
-rw-r--r--drivers/hv/mshv_eventfd.h71
-rw-r--r--drivers/hv/mshv_irq.c124
-rw-r--r--drivers/hv/mshv_portid_table.c83
-rw-r--r--drivers/hv/mshv_root.h311
-rw-r--r--drivers/hv/mshv_root_hv_call.c849
-rw-r--r--drivers/hv/mshv_root_main.c2307
-rw-r--r--drivers/hv/mshv_synic.c665
-rw-r--r--drivers/hv/vmbus_drv.c54
16 files changed, 5927 insertions, 69 deletions
diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig
index 862c47b191af..6c1416167bd2 100644
--- a/drivers/hv/Kconfig
+++ b/drivers/hv/Kconfig
@@ -55,4 +55,21 @@ config HYPERV_BALLOON
help
Select this option to enable Hyper-V Balloon driver.
+config MSHV_ROOT
+ tristate "Microsoft Hyper-V root partition support"
+ depends on HYPERV && (X86_64 || ARM64)
+ depends on !HYPERV_VTL_MODE
+ # The hypervisor interface operates on 4k pages. Enforcing it here
+ # simplifies many assumptions in the root partition code.
+ # e.g. When withdrawing memory, the hypervisor gives back 4k pages in
+ # no particular order, making it impossible to reassemble larger pages
+ depends on PAGE_SIZE_4KB
+ select EVENTFD
+ default n
+ help
+ Select this option to enable support for booting and running as root
+ partition on Microsoft Hyper-V.
+
+ If unsure, say N.
+
endmenu
diff --git a/drivers/hv/Makefile b/drivers/hv/Makefile
index b992c0ed182b..976189c725dc 100644
--- a/drivers/hv/Makefile
+++ b/drivers/hv/Makefile
@@ -2,6 +2,7 @@
obj-$(CONFIG_HYPERV) += hv_vmbus.o
obj-$(CONFIG_HYPERV_UTILS) += hv_utils.o
obj-$(CONFIG_HYPERV_BALLOON) += hv_balloon.o
+obj-$(CONFIG_MSHV_ROOT) += mshv_root.o
CFLAGS_hv_trace.o = -I$(src)
CFLAGS_hv_balloon.o = -I$(src)
@@ -11,6 +12,9 @@ hv_vmbus-y := vmbus_drv.o \
channel_mgmt.o ring_buffer.o hv_trace.o
hv_vmbus-$(CONFIG_HYPERV_TESTING) += hv_debugfs.o
hv_utils-y := hv_util.o hv_kvp.o hv_snapshot.o hv_utils_transport.o
+mshv_root-y := mshv_root_main.o mshv_synic.o mshv_eventfd.o mshv_irq.o \
+ mshv_root_hv_call.o mshv_portid_table.o
# Code that must be built-in
obj-$(subst m,y,$(CONFIG_HYPERV)) += hv_common.o
+obj-$(subst m,y,$(CONFIG_MSHV_ROOT)) += hv_proc.o mshv_common.o
diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c
index 36d9ba097ff5..308c8f279df8 100644
--- a/drivers/hv/hv.c
+++ b/drivers/hv/hv.c
@@ -144,7 +144,7 @@ int hv_synic_alloc(void)
* Synic message and event pages are allocated by paravisor.
* Skip these pages allocation here.
*/
- if (!ms_hyperv.paravisor_present && !hv_root_partition) {
+ if (!ms_hyperv.paravisor_present && !hv_root_partition()) {
hv_cpu->synic_message_page =
(void *)get_zeroed_page(GFP_ATOMIC);
if (!hv_cpu->synic_message_page) {
@@ -272,7 +272,7 @@ void hv_synic_enable_regs(unsigned int cpu)
simp.as_uint64 = hv_get_msr(HV_MSR_SIMP);
simp.simp_enabled = 1;
- if (ms_hyperv.paravisor_present || hv_root_partition) {
+ if (ms_hyperv.paravisor_present || hv_root_partition()) {
/* Mask out vTOM bit. ioremap_cache() maps decrypted */
u64 base = (simp.base_simp_gpa << HV_HYP_PAGE_SHIFT) &
~ms_hyperv.shared_gpa_boundary;
@@ -291,7 +291,7 @@ void hv_synic_enable_regs(unsigned int cpu)
siefp.as_uint64 = hv_get_msr(HV_MSR_SIEFP);
siefp.siefp_enabled = 1;
- if (ms_hyperv.paravisor_present || hv_root_partition) {
+ if (ms_hyperv.paravisor_present || hv_root_partition()) {
/* Mask out vTOM bit. ioremap_cache() maps decrypted */
u64 base = (siefp.base_siefp_gpa << HV_HYP_PAGE_SHIFT) &
~ms_hyperv.shared_gpa_boundary;
@@ -313,17 +313,7 @@ void hv_synic_enable_regs(unsigned int cpu)
shared_sint.vector = vmbus_interrupt;
shared_sint.masked = false;
-
- /*
- * On architectures where Hyper-V doesn't support AEOI (e.g., ARM64),
- * it doesn't provide a recommendation flag and AEOI must be disabled.
- */
-#ifdef HV_DEPRECATING_AEOI_RECOMMENDED
- shared_sint.auto_eoi =
- !(ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED);
-#else
- shared_sint.auto_eoi = 0;
-#endif
+ shared_sint.auto_eoi = hv_recommend_using_aeoi();
hv_set_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT, shared_sint.as_uint64);
/* Enable the global synic bit */
@@ -367,7 +357,7 @@ void hv_synic_disable_regs(unsigned int cpu)
* addresses.
*/
simp.simp_enabled = 0;
- if (ms_hyperv.paravisor_present || hv_root_partition) {
+ if (ms_hyperv.paravisor_present || hv_root_partition()) {
iounmap(hv_cpu->synic_message_page);
hv_cpu->synic_message_page = NULL;
} else {
@@ -379,7 +369,7 @@ void hv_synic_disable_regs(unsigned int cpu)
siefp.as_uint64 = hv_get_msr(HV_MSR_SIEFP);
siefp.siefp_enabled = 0;
- if (ms_hyperv.paravisor_present || hv_root_partition) {
+ if (ms_hyperv.paravisor_present || hv_root_partition()) {
iounmap(hv_cpu->synic_event_page);
hv_cpu->synic_event_page = NULL;
} else {
@@ -433,13 +423,47 @@ retry:
return pending;
}
+static int hv_pick_new_cpu(struct vmbus_channel *channel)
+{
+ int ret = -EBUSY;
+ int start;
+ int cpu;
+
+ lockdep_assert_cpus_held();
+ lockdep_assert_held(&vmbus_connection.channel_mutex);
+
+ /*
+ * We can't assume that the relevant interrupts will be sent before
+ * the cpu is offlined on older versions of hyperv.
+ */
+ if (vmbus_proto_version < VERSION_WIN10_V5_3)
+ return -EBUSY;
+
+ start = get_random_u32_below(nr_cpu_ids);
+
+ for_each_cpu_wrap(cpu, cpu_online_mask, start) {
+ if (channel->target_cpu == cpu ||
+ channel->target_cpu == VMBUS_CONNECT_CPU)
+ continue;
+
+ ret = vmbus_channel_set_cpu(channel, cpu);
+ if (!ret)
+ break;
+ }
+
+ if (ret)
+ ret = vmbus_channel_set_cpu(channel, VMBUS_CONNECT_CPU);
+
+ return ret;
+}
+
/*
* hv_synic_cleanup - Cleanup routine for hv_synic_init().
*/
int hv_synic_cleanup(unsigned int cpu)
{
struct vmbus_channel *channel, *sc;
- bool channel_found = false;
+ int ret = 0;
if (vmbus_connection.conn_state != CONNECTED)
goto always_cleanup;
@@ -456,38 +480,34 @@ int hv_synic_cleanup(unsigned int cpu)
/*
* Search for channels which are bound to the CPU we're about to
- * cleanup. In case we find one and vmbus is still connected, we
- * fail; this will effectively prevent CPU offlining.
- *
- * TODO: Re-bind the channels to different CPUs.
+ * cleanup.
*/
mutex_lock(&vmbus_connection.channel_mutex);
list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
if (channel->target_cpu == cpu) {
- channel_found = true;
- break;
+ ret = hv_pick_new_cpu(channel);
+ if (ret) {
+ mutex_unlock(&vmbus_connection.channel_mutex);
+ return ret;
+ }
}
list_for_each_entry(sc, &channel->sc_list, sc_list) {
if (sc->target_cpu == cpu) {
- channel_found = true;
- break;
+ ret = hv_pick_new_cpu(sc);
+ if (ret) {
+ mutex_unlock(&vmbus_connection.channel_mutex);
+ return ret;
+ }
}
}
- if (channel_found)
- break;
}
mutex_unlock(&vmbus_connection.channel_mutex);
- if (channel_found)
- return -EBUSY;
-
/*
- * channel_found == false means that any channels that were previously
- * assigned to the CPU have been reassigned elsewhere with a call of
- * vmbus_send_modifychannel(). Scan the event flags page looking for
- * bits that are set and waiting with a timeout for vmbus_chan_sched()
- * to process such bits. If bits are still set after this operation
- * and VMBus is connected, fail the CPU offlining operation.
+ * Scan the event flags page looking for bits that are set and waiting
+ * with a timeout for vmbus_chan_sched() to process such bits. If bits
+ * are still set after this operation and VMBus is connected, fail the
+ * CPU offlining operation.
*/
if (vmbus_proto_version >= VERSION_WIN10_V4_1 && hv_synic_event_pending())
return -EBUSY;
@@ -497,5 +517,5 @@ always_cleanup:
hv_synic_disable_regs(cpu);
- return 0;
+ return ret;
}
diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c
index f2e6f55d6ca6..b3b11be11650 100644
--- a/drivers/hv/hv_common.c
+++ b/drivers/hv/hv_common.c
@@ -31,8 +31,14 @@
#include <hyperv/hvhdk.h>
#include <asm/mshyperv.h>
+u64 hv_current_partition_id = HV_PARTITION_ID_SELF;
+EXPORT_SYMBOL_GPL(hv_current_partition_id);
+
+enum hv_partition_type hv_curr_partition_type;
+EXPORT_SYMBOL_GPL(hv_curr_partition_type);
+
/*
- * hv_root_partition, ms_hyperv and hv_nested are defined here with other
+ * ms_hyperv and hv_nested are defined here with other
* Hyper-V specific globals so they are shared across all architectures and are
* built only when CONFIG_HYPERV is defined. But on x86,
* ms_hyperv_init_platform() is built even when CONFIG_HYPERV is not
@@ -40,9 +46,6 @@
* here, allowing for an overriding definition in the module containing
* ms_hyperv_init_platform().
*/
-bool __weak hv_root_partition;
-EXPORT_SYMBOL_GPL(hv_root_partition);
-
bool __weak hv_nested;
EXPORT_SYMBOL_GPL(hv_nested);
@@ -66,6 +69,16 @@ static void hv_kmsg_dump_unregister(void);
static struct ctl_table_header *hv_ctl_table_hdr;
/*
+ * Per-cpu array holding the tail pointer for the SynIC event ring buffer
+ * for each SINT.
+ *
+ * We cannot maintain this in mshv driver because the tail pointer should
+ * persist even if the mshv driver is unloaded.
+ */
+u8 * __percpu *hv_synic_eventring_tail;
+EXPORT_SYMBOL_GPL(hv_synic_eventring_tail);
+
+/*
* Hyper-V specific initialization and shutdown code that is
* common across all architectures. Called from architecture
* specific initialization functions.
@@ -87,6 +100,9 @@ void __init hv_common_free(void)
free_percpu(hyperv_pcpu_input_arg);
hyperv_pcpu_input_arg = NULL;
+
+ free_percpu(hv_synic_eventring_tail);
+ hv_synic_eventring_tail = NULL;
}
/*
@@ -280,7 +296,26 @@ static void hv_kmsg_dump_register(void)
static inline bool hv_output_page_exists(void)
{
- return hv_root_partition || IS_ENABLED(CONFIG_HYPERV_VTL_MODE);
+ return hv_root_partition() || IS_ENABLED(CONFIG_HYPERV_VTL_MODE);
+}
+
+void __init hv_get_partition_id(void)
+{
+ struct hv_output_get_partition_id *output;
+ unsigned long flags;
+ u64 status, pt_id;
+
+ local_irq_save(flags);
+ output = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ status = hv_do_hypercall(HVCALL_GET_PARTITION_ID, NULL, &output);
+ pt_id = output->partition_id;
+ local_irq_restore(flags);
+
+ if (hv_result_success(status))
+ hv_current_partition_id = pt_id;
+ else
+ pr_err("Hyper-V: failed to get partition ID: %#x\n",
+ hv_result(status));
}
int __init hv_common_init(void)
@@ -350,6 +385,11 @@ int __init hv_common_init(void)
BUG_ON(!hyperv_pcpu_output_arg);
}
+ if (hv_root_partition()) {
+ hv_synic_eventring_tail = alloc_percpu(u8 *);
+ BUG_ON(!hv_synic_eventring_tail);
+ }
+
hv_vp_index = kmalloc_array(nr_cpu_ids, sizeof(*hv_vp_index),
GFP_KERNEL);
if (!hv_vp_index) {
@@ -438,11 +478,12 @@ error:
int hv_common_cpu_init(unsigned int cpu)
{
void **inputarg, **outputarg;
+ u8 **synic_eventring_tail;
u64 msr_vp_index;
gfp_t flags;
const int pgcount = hv_output_page_exists() ? 2 : 1;
void *mem;
- int ret;
+ int ret = 0;
/* hv_cpu_init() can be called with IRQs disabled from hv_resume() */
flags = irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL;
@@ -450,8 +491,8 @@ int hv_common_cpu_init(unsigned int cpu)
inputarg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg);
/*
- * hyperv_pcpu_input_arg and hyperv_pcpu_output_arg memory is already
- * allocated if this CPU was previously online and then taken offline
+ * The per-cpu memory is already allocated if this CPU was previously
+ * online and then taken offline
*/
if (!*inputarg) {
mem = kmalloc(pgcount * HV_HYP_PAGE_SIZE, flags);
@@ -498,11 +539,21 @@ int hv_common_cpu_init(unsigned int cpu)
if (msr_vp_index > hv_max_vp_index)
hv_max_vp_index = msr_vp_index;
- return 0;
+ if (hv_root_partition()) {
+ synic_eventring_tail = (u8 **)this_cpu_ptr(hv_synic_eventring_tail);
+ *synic_eventring_tail = kcalloc(HV_SYNIC_SINT_COUNT,
+ sizeof(u8), flags);
+ /* No need to unwind any of the above on failure here */
+ if (unlikely(!*synic_eventring_tail))
+ ret = -ENOMEM;
+ }
+
+ return ret;
}
int hv_common_cpu_die(unsigned int cpu)
{
+ u8 **synic_eventring_tail;
/*
* The hyperv_pcpu_input_arg and hyperv_pcpu_output_arg memory
* is not freed when the CPU goes offline as the hyperv_pcpu_input_arg
@@ -515,6 +566,10 @@ int hv_common_cpu_die(unsigned int cpu)
* originally allocated memory is reused in hv_common_cpu_init().
*/
+ synic_eventring_tail = this_cpu_ptr(hv_synic_eventring_tail);
+ kfree(*synic_eventring_tail);
+ *synic_eventring_tail = NULL;
+
return 0;
}
@@ -572,7 +627,7 @@ EXPORT_SYMBOL_GPL(hv_setup_dma_ops);
bool hv_is_hibernation_supported(void)
{
- return !hv_root_partition && acpi_sleep_state_supported(ACPI_STATE_S4);
+ return !hv_root_partition() && acpi_sleep_state_supported(ACPI_STATE_S4);
}
EXPORT_SYMBOL_GPL(hv_is_hibernation_supported);
@@ -625,6 +680,11 @@ void __weak hv_remove_vmbus_handler(void)
}
EXPORT_SYMBOL_GPL(hv_remove_vmbus_handler);
+void __weak hv_setup_mshv_handler(void (*handler)(void))
+{
+}
+EXPORT_SYMBOL_GPL(hv_setup_mshv_handler);
+
void __weak hv_setup_kexec_handler(void (*handler)(void))
{
}
@@ -661,3 +721,121 @@ u64 __weak hv_tdx_hypercall(u64 control, u64 param1, u64 param2)
return HV_STATUS_INVALID_PARAMETER;
}
EXPORT_SYMBOL_GPL(hv_tdx_hypercall);
+
+void hv_identify_partition_type(void)
+{
+ /* Assume guest role */
+ hv_curr_partition_type = HV_PARTITION_TYPE_GUEST;
+ /*
+ * Check partition creation and cpu management privileges
+ *
+ * Hyper-V should never specify running as root and as a Confidential
+ * VM. But to protect against a compromised/malicious Hyper-V trying
+ * to exploit root behavior to expose Confidential VM memory, ignore
+ * the root partition setting if also a Confidential VM.
+ */
+ if ((ms_hyperv.priv_high & HV_CREATE_PARTITIONS) &&
+ (ms_hyperv.priv_high & HV_CPU_MANAGEMENT) &&
+ !(ms_hyperv.priv_high & HV_ISOLATION)) {
+ pr_info("Hyper-V: running as root partition\n");
+ if (IS_ENABLED(CONFIG_MSHV_ROOT))
+ hv_curr_partition_type = HV_PARTITION_TYPE_ROOT;
+ else
+ pr_crit("Hyper-V: CONFIG_MSHV_ROOT not enabled!\n");
+ }
+}
+
+struct hv_status_info {
+ char *string;
+ int errno;
+ u16 code;
+};
+
+/*
+ * Note on the errno mappings:
+ * A failed hypercall is usually only recoverable (or loggable) near
+ * the call site where the HV_STATUS_* code is known. So the errno
+ * it gets converted to is not too useful further up the stack.
+ * Provide a few mappings that could be useful, and revert to -EIO
+ * as a fallback.
+ */
+static const struct hv_status_info hv_status_infos[] = {
+#define _STATUS_INFO(status, errno) { #status, (errno), (status) }
+ _STATUS_INFO(HV_STATUS_SUCCESS, 0),
+ _STATUS_INFO(HV_STATUS_INVALID_HYPERCALL_CODE, -EINVAL),
+ _STATUS_INFO(HV_STATUS_INVALID_HYPERCALL_INPUT, -EINVAL),
+ _STATUS_INFO(HV_STATUS_INVALID_ALIGNMENT, -EIO),
+ _STATUS_INFO(HV_STATUS_INVALID_PARAMETER, -EINVAL),
+ _STATUS_INFO(HV_STATUS_ACCESS_DENIED, -EIO),
+ _STATUS_INFO(HV_STATUS_INVALID_PARTITION_STATE, -EIO),
+ _STATUS_INFO(HV_STATUS_OPERATION_DENIED, -EIO),
+ _STATUS_INFO(HV_STATUS_UNKNOWN_PROPERTY, -EIO),
+ _STATUS_INFO(HV_STATUS_PROPERTY_VALUE_OUT_OF_RANGE, -EIO),
+ _STATUS_INFO(HV_STATUS_INSUFFICIENT_MEMORY, -ENOMEM),
+ _STATUS_INFO(HV_STATUS_INVALID_PARTITION_ID, -EINVAL),
+ _STATUS_INFO(HV_STATUS_INVALID_VP_INDEX, -EINVAL),
+ _STATUS_INFO(HV_STATUS_NOT_FOUND, -EIO),
+ _STATUS_INFO(HV_STATUS_INVALID_PORT_ID, -EINVAL),
+ _STATUS_INFO(HV_STATUS_INVALID_CONNECTION_ID, -EINVAL),
+ _STATUS_INFO(HV_STATUS_INSUFFICIENT_BUFFERS, -EIO),
+ _STATUS_INFO(HV_STATUS_NOT_ACKNOWLEDGED, -EIO),
+ _STATUS_INFO(HV_STATUS_INVALID_VP_STATE, -EIO),
+ _STATUS_INFO(HV_STATUS_NO_RESOURCES, -EIO),
+ _STATUS_INFO(HV_STATUS_PROCESSOR_FEATURE_NOT_SUPPORTED, -EIO),
+ _STATUS_INFO(HV_STATUS_INVALID_LP_INDEX, -EINVAL),
+ _STATUS_INFO(HV_STATUS_INVALID_REGISTER_VALUE, -EINVAL),
+ _STATUS_INFO(HV_STATUS_INVALID_LP_INDEX, -EIO),
+ _STATUS_INFO(HV_STATUS_INVALID_REGISTER_VALUE, -EIO),
+ _STATUS_INFO(HV_STATUS_OPERATION_FAILED, -EIO),
+ _STATUS_INFO(HV_STATUS_TIME_OUT, -EIO),
+ _STATUS_INFO(HV_STATUS_CALL_PENDING, -EIO),
+ _STATUS_INFO(HV_STATUS_VTL_ALREADY_ENABLED, -EIO),
+#undef _STATUS_INFO
+};
+
+static inline const struct hv_status_info *find_hv_status_info(u64 hv_status)
+{
+ int i;
+ u16 code = hv_result(hv_status);
+
+ for (i = 0; i < ARRAY_SIZE(hv_status_infos); ++i) {
+ const struct hv_status_info *info = &hv_status_infos[i];
+
+ if (info->code == code)
+ return info;
+ }
+
+ return NULL;
+}
+
+/* Convert a hypercall result into a linux-friendly error code. */
+int hv_result_to_errno(u64 status)
+{
+ const struct hv_status_info *info;
+
+ /* hv_do_hypercall() may return U64_MAX, hypercalls aren't possible */
+ if (unlikely(status == U64_MAX))
+ return -EOPNOTSUPP;
+
+ info = find_hv_status_info(status);
+ if (info)
+ return info->errno;
+
+ return -EIO;
+}
+EXPORT_SYMBOL_GPL(hv_result_to_errno);
+
+const char *hv_result_to_string(u64 status)
+{
+ const struct hv_status_info *info;
+
+ if (unlikely(status == U64_MAX))
+ return "Hypercall page missing!";
+
+ info = find_hv_status_info(status);
+ if (info)
+ return info->string;
+
+ return "Unknown";
+}
+EXPORT_SYMBOL_GPL(hv_result_to_string);
diff --git a/drivers/hv/hv_proc.c b/drivers/hv/hv_proc.c
new file mode 100644
index 000000000000..7d7ecb6f6137
--- /dev/null
+++ b/drivers/hv/hv_proc.c
@@ -0,0 +1,195 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/types.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/clockchips.h>
+#include <linux/slab.h>
+#include <linux/cpuhotplug.h>
+#include <linux/minmax.h>
+#include <asm/mshyperv.h>
+
+/*
+ * See struct hv_deposit_memory. The first u64 is partition ID, the rest
+ * are GPAs.
+ */
+#define HV_DEPOSIT_MAX (HV_HYP_PAGE_SIZE / sizeof(u64) - 1)
+
+/* Deposits exact number of pages. Must be called with interrupts enabled. */
+int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages)
+{
+ struct page **pages, *page;
+ int *counts;
+ int num_allocations;
+ int i, j, page_count;
+ int order;
+ u64 status;
+ int ret;
+ u64 base_pfn;
+ struct hv_deposit_memory *input_page;
+ unsigned long flags;
+
+ if (num_pages > HV_DEPOSIT_MAX)
+ return -E2BIG;
+ if (!num_pages)
+ return 0;
+
+ /* One buffer for page pointers and counts */
+ page = alloc_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+ pages = page_address(page);
+
+ counts = kcalloc(HV_DEPOSIT_MAX, sizeof(int), GFP_KERNEL);
+ if (!counts) {
+ free_page((unsigned long)pages);
+ return -ENOMEM;
+ }
+
+ /* Allocate all the pages before disabling interrupts */
+ i = 0;
+
+ while (num_pages) {
+ /* Find highest order we can actually allocate */
+ order = 31 - __builtin_clz(num_pages);
+
+ while (1) {
+ pages[i] = alloc_pages_node(node, GFP_KERNEL, order);
+ if (pages[i])
+ break;
+ if (!order) {
+ ret = -ENOMEM;
+ num_allocations = i;
+ goto err_free_allocations;
+ }
+ --order;
+ }
+
+ split_page(pages[i], order);
+ counts[i] = 1 << order;
+ num_pages -= counts[i];
+ i++;
+ }
+ num_allocations = i;
+
+ local_irq_save(flags);
+
+ input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
+
+ input_page->partition_id = partition_id;
+
+ /* Populate gpa_page_list - these will fit on the input page */
+ for (i = 0, page_count = 0; i < num_allocations; ++i) {
+ base_pfn = page_to_pfn(pages[i]);
+ for (j = 0; j < counts[i]; ++j, ++page_count)
+ input_page->gpa_page_list[page_count] = base_pfn + j;
+ }
+ status = hv_do_rep_hypercall(HVCALL_DEPOSIT_MEMORY,
+ page_count, 0, input_page, NULL);
+ local_irq_restore(flags);
+ if (!hv_result_success(status)) {
+ hv_status_err(status, "\n");
+ ret = hv_result_to_errno(status);
+ goto err_free_allocations;
+ }
+
+ ret = 0;
+ goto free_buf;
+
+err_free_allocations:
+ for (i = 0; i < num_allocations; ++i) {
+ base_pfn = page_to_pfn(pages[i]);
+ for (j = 0; j < counts[i]; ++j)
+ __free_page(pfn_to_page(base_pfn + j));
+ }
+
+free_buf:
+ free_page((unsigned long)pages);
+ kfree(counts);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(hv_call_deposit_pages);
+
+int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id)
+{
+ struct hv_input_add_logical_processor *input;
+ struct hv_output_add_logical_processor *output;
+ u64 status;
+ unsigned long flags;
+ int ret = 0;
+
+ /*
+ * When adding a logical processor, the hypervisor may return
+ * HV_STATUS_INSUFFICIENT_MEMORY. When that happens, we deposit more
+ * pages and retry.
+ */
+ do {
+ local_irq_save(flags);
+
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ /* We don't do anything with the output right now */
+ output = *this_cpu_ptr(hyperv_pcpu_output_arg);
+
+ input->lp_index = lp_index;
+ input->apic_id = apic_id;
+ input->proximity_domain_info = hv_numa_node_to_pxm_info(node);
+ status = hv_do_hypercall(HVCALL_ADD_LOGICAL_PROCESSOR,
+ input, output);
+ local_irq_restore(flags);
+
+ if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
+ if (!hv_result_success(status)) {
+ hv_status_err(status, "cpu %u apic ID: %u\n",
+ lp_index, apic_id);
+ ret = hv_result_to_errno(status);
+ }
+ break;
+ }
+ ret = hv_call_deposit_pages(node, hv_current_partition_id, 1);
+ } while (!ret);
+
+ return ret;
+}
+
+int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags)
+{
+ struct hv_create_vp *input;
+ u64 status;
+ unsigned long irq_flags;
+ int ret = 0;
+
+ /* Root VPs don't seem to need pages deposited */
+ if (partition_id != hv_current_partition_id) {
+ /* The value 90 is empirically determined. It may change. */
+ ret = hv_call_deposit_pages(node, partition_id, 90);
+ if (ret)
+ return ret;
+ }
+
+ do {
+ local_irq_save(irq_flags);
+
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+
+ input->partition_id = partition_id;
+ input->vp_index = vp_index;
+ input->flags = flags;
+ input->subnode_type = HV_SUBNODE_ANY;
+ input->proximity_domain_info = hv_numa_node_to_pxm_info(node);
+ status = hv_do_hypercall(HVCALL_CREATE_VP, input, NULL);
+ local_irq_restore(irq_flags);
+
+ if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
+ if (!hv_result_success(status)) {
+ hv_status_err(status, "vcpu: %u, lp: %u\n",
+ vp_index, flags);
+ ret = hv_result_to_errno(status);
+ }
+ break;
+ }
+ ret = hv_call_deposit_pages(node, partition_id, 1);
+
+ } while (!ret);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(hv_call_create_vp);
diff --git a/drivers/hv/mshv.h b/drivers/hv/mshv.h
new file mode 100644
index 000000000000..0340a67acd0a
--- /dev/null
+++ b/drivers/hv/mshv.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2023, Microsoft Corporation.
+ */
+
+#ifndef _MSHV_H_
+#define _MSHV_H_
+
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <hyperv/hvhdk.h>
+
+#define mshv_field_nonzero(STRUCT, MEMBER) \
+ memchr_inv(&((STRUCT).MEMBER), \
+ 0, sizeof_field(typeof(STRUCT), MEMBER))
+
+int hv_call_get_vp_registers(u32 vp_index, u64 partition_id, u16 count,
+ union hv_input_vtl input_vtl,
+ struct hv_register_assoc *registers);
+
+int hv_call_set_vp_registers(u32 vp_index, u64 partition_id, u16 count,
+ union hv_input_vtl input_vtl,
+ struct hv_register_assoc *registers);
+
+int hv_call_get_partition_property(u64 partition_id, u64 property_code,
+ u64 *property_value);
+
+int mshv_do_pre_guest_mode_work(ulong th_flags);
+
+#endif /* _MSHV_H */
diff --git a/drivers/hv/mshv_common.c b/drivers/hv/mshv_common.c
new file mode 100644
index 000000000000..2575e6d7a71f
--- /dev/null
+++ b/drivers/hv/mshv_common.c
@@ -0,0 +1,161 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024, Microsoft Corporation.
+ *
+ * This file contains functions that will be called from one or more modules.
+ * If any of these modules are configured to build, this file is built and just
+ * statically linked in.
+ *
+ * Authors: Microsoft Linux virtualization team
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <asm/mshyperv.h>
+#include <linux/resume_user_mode.h>
+
+#include "mshv.h"
+
+#define HV_GET_REGISTER_BATCH_SIZE \
+ (HV_HYP_PAGE_SIZE / sizeof(union hv_register_value))
+#define HV_SET_REGISTER_BATCH_SIZE \
+ ((HV_HYP_PAGE_SIZE - sizeof(struct hv_input_set_vp_registers)) \
+ / sizeof(struct hv_register_assoc))
+
+int hv_call_get_vp_registers(u32 vp_index, u64 partition_id, u16 count,
+ union hv_input_vtl input_vtl,
+ struct hv_register_assoc *registers)
+{
+ struct hv_input_get_vp_registers *input_page;
+ union hv_register_value *output_page;
+ u16 completed = 0;
+ unsigned long remaining = count;
+ int rep_count, i;
+ u64 status = HV_STATUS_SUCCESS;
+ unsigned long flags;
+
+ local_irq_save(flags);
+
+ input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ output_page = *this_cpu_ptr(hyperv_pcpu_output_arg);
+
+ input_page->partition_id = partition_id;
+ input_page->vp_index = vp_index;
+ input_page->input_vtl.as_uint8 = input_vtl.as_uint8;
+ input_page->rsvd_z8 = 0;
+ input_page->rsvd_z16 = 0;
+
+ while (remaining) {
+ rep_count = min(remaining, HV_GET_REGISTER_BATCH_SIZE);
+ for (i = 0; i < rep_count; ++i)
+ input_page->names[i] = registers[i].name;
+
+ status = hv_do_rep_hypercall(HVCALL_GET_VP_REGISTERS, rep_count,
+ 0, input_page, output_page);
+ if (!hv_result_success(status))
+ break;
+
+ completed = hv_repcomp(status);
+ for (i = 0; i < completed; ++i)
+ registers[i].value = output_page[i];
+
+ registers += completed;
+ remaining -= completed;
+ }
+ local_irq_restore(flags);
+
+ return hv_result_to_errno(status);
+}
+EXPORT_SYMBOL_GPL(hv_call_get_vp_registers);
+
+int hv_call_set_vp_registers(u32 vp_index, u64 partition_id, u16 count,
+ union hv_input_vtl input_vtl,
+ struct hv_register_assoc *registers)
+{
+ struct hv_input_set_vp_registers *input_page;
+ u16 completed = 0;
+ unsigned long remaining = count;
+ int rep_count;
+ u64 status = HV_STATUS_SUCCESS;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
+
+ input_page->partition_id = partition_id;
+ input_page->vp_index = vp_index;
+ input_page->input_vtl.as_uint8 = input_vtl.as_uint8;
+ input_page->rsvd_z8 = 0;
+ input_page->rsvd_z16 = 0;
+
+ while (remaining) {
+ rep_count = min(remaining, HV_SET_REGISTER_BATCH_SIZE);
+ memcpy(input_page->elements, registers,
+ sizeof(struct hv_register_assoc) * rep_count);
+
+ status = hv_do_rep_hypercall(HVCALL_SET_VP_REGISTERS, rep_count,
+ 0, input_page, NULL);
+ if (!hv_result_success(status))
+ break;
+
+ completed = hv_repcomp(status);
+ registers += completed;
+ remaining -= completed;
+ }
+
+ local_irq_restore(flags);
+
+ return hv_result_to_errno(status);
+}
+EXPORT_SYMBOL_GPL(hv_call_set_vp_registers);
+
+int hv_call_get_partition_property(u64 partition_id,
+ u64 property_code,
+ u64 *property_value)
+{
+ u64 status;
+ unsigned long flags;
+ struct hv_input_get_partition_property *input;
+ struct hv_output_get_partition_property *output;
+
+ local_irq_save(flags);
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ output = *this_cpu_ptr(hyperv_pcpu_output_arg);
+ memset(input, 0, sizeof(*input));
+ input->partition_id = partition_id;
+ input->property_code = property_code;
+ status = hv_do_hypercall(HVCALL_GET_PARTITION_PROPERTY, input, output);
+
+ if (!hv_result_success(status)) {
+ local_irq_restore(flags);
+ return hv_result_to_errno(status);
+ }
+ *property_value = output->property_value;
+
+ local_irq_restore(flags);
+
+ return 0;