hyperv-next for 6.15

-----BEGIN PGP SIGNATURE-----
 
 iQFHBAABCgAxFiEEIbPD0id6easf0xsudhRwX5BBoF4FAmfhlLATHHdlaS5saXVA
 a2VybmVsLm9yZwAKCRB2FHBfkEGgXgchCADOz33rSm4G4w4r0qT05dTDi/lZkEdK
 64dQq322XXP/C9FfR66d30243gsAmuM5a0SvzFHLXAOu6yqM270Xehd/Rud+Um2s
 lSVnc0Ux0AWBgksqFd0t577aN7zmJEukosEYO5lBNop+zOcadrm3S6Th/AoL2h/D
 yphPkhH13bsCK+Wll/eBOQLIhC9iA0konYbBLuEQ5MqvUbrzc6Rmb5gxsHHZKOqg
 vLjkrYR/d3s2gIpKxiFp0RwvzGyffZEHxvU/YF3hTenPMlTlnXWbyspBSTVmWggP
 13IFLzqxDdW9RgUnGB4xRc424AC1LKqEr42QPQE7zGvl2jdJriA2Q1LT
 =BXqj
 -----END PGP SIGNATURE-----

Merge tag 'hyperv-next-signed-20250324' of git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux

Pull hyperv updates from Wei Liu:

 - Add support for running as the root partition in Hyper-V (Microsoft
   Hypervisor) by exposing /dev/mshv (Nuno and various people); a hedged
   userspace sketch follows this list

 - Add support for CPU offlining in Hyper-V (Hamza Mahfooz)

 - Misc fixes and cleanups (Roman Kisel, Tianyu Lan, Wei Liu, Michael
   Kelley, Thorsten Blum)
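
(Rough sketch of the VMM-facing side of the first item: the series exposes a
character device that a virtual machine monitor opens to create and run guest
partitions when Linux is the root partition. The device path comes from the
series itself; the partition/VP ioctls are only hinted at here, since their
exact names and argument structs are defined in uapi/linux/mshv.h.)

/* Hypothetical userspace sketch -- not part of the merge. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int mshv = open("/dev/mshv", O_RDWR | O_CLOEXEC);

	if (mshv < 0) {
		/* Driver not loaded, or not running as the root partition. */
		perror("open /dev/mshv");
		return 1;
	}
	/* Partition and VP creation ioctls from uapi/linux/mshv.h go here. */
	close(mshv);
	return 0;
}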

* tag 'hyperv-next-signed-20250324' of git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux: (24 commits)
  x86/hyperv: fix an indentation issue in mshyperv.h
  x86/hyperv: Add comments about hv_vpset and var size hypercall input args
  Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs
  hyperv: Add definitions for root partition driver to hv headers
  x86: hyperv: Add mshv_handler() irq handler and setup function
  Drivers: hv: Introduce per-cpu event ring tail
  Drivers: hv: Export some functions for use by root partition module
  acpi: numa: Export node_to_pxm()
  hyperv: Introduce hv_recommend_using_aeoi()
  arm64/hyperv: Add some missing functions to arm64
  x86/mshyperv: Add support for extended Hyper-V features
  hyperv: Log hypercall status codes as strings
  x86/hyperv: Fix check of return value from snp_set_vmsa()
  x86/hyperv: Add VTL mode callback for restarting the system
  x86/hyperv: Add VTL mode emergency restart callback
  hyperv: Remove unused union and structs
  hyperv: Add CONFIG_MSHV_ROOT to gate root partition support
  hyperv: Change hv_root_partition into a function
  hyperv: Convert hypercall statuses to linux error codes
  drivers/hv: add CPU offlining support
  ...
Committed by Linus Torvalds on 2025-03-25 14:47:04 -07:00 (commit a5b3d8660b)
39 changed files with 6514 additions and 228 deletions

View File

@ -370,6 +370,8 @@ Code Seq# Include File Comments
0xB7 all uapi/linux/remoteproc_cdev.h <mailto:linux-remoteproc@vger.kernel.org>
0xB7 all uapi/linux/nsfs.h <mailto:Andrei Vagin <avagin@openvz.org>>
0xB8 01-02 uapi/misc/mrvl_cn10k_dpi.h Marvell CN10K DPI driver
0xB8 all uapi/linux/mshv.h Microsoft Hyper-V /dev/mshv driver
<mailto:linux-hyperv@vger.kernel.org>
0xC0 00-0F linux/usb/iowarrior.h
0xCA 00-0F uapi/misc/cxl.h
0xCA 10-2F uapi/misc/ocxl.h

View File

@ -53,6 +53,23 @@ u64 hv_do_fast_hypercall8(u16 code, u64 input)
}
EXPORT_SYMBOL_GPL(hv_do_fast_hypercall8);
/*
* hv_do_fast_hypercall16 -- Invoke the specified hypercall
* with arguments in registers instead of physical memory.
* Avoids the overhead of virt_to_phys for simple hypercalls.
*/
u64 hv_do_fast_hypercall16(u16 code, u64 input1, u64 input2)
{
struct arm_smccc_res res;
u64 control;
control = (u64)code | HV_HYPERCALL_FAST_BIT;
arm_smccc_1_1_hvc(HV_FUNC_ID, control, input1, input2, &res);
return res.a0;
}
EXPORT_SYMBOL_GPL(hv_do_fast_hypercall16);
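
A caller-side sketch of the helper exported above, to show the calling
convention: both 8-byte arguments travel in registers, so nothing has to be
staged in the per-cpu hypercall page and no virt_to_phys() is needed. The
hypercall code and argument meanings below are placeholders, not a real
Hyper-V call; the status conversion uses hv_result_to_errno(), which is added
later in this merge.

#include <asm/mshyperv.h>

#define HVCALL_EXAMPLE	0x0000	/* placeholder call code, illustration only */

static int example_fast_call(u64 arg_lo, u64 arg_hi)
{
	/* Two u64 inputs in registers; status comes back the usual way. */
	u64 status = hv_do_fast_hypercall16(HVCALL_EXAMPLE, arg_lo, arg_hi);

	return hv_result_to_errno(status);
}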
/*
* Set a single VP register to a 64-bit value.
*/

View File

@ -26,6 +26,7 @@ int hv_get_hypervisor_version(union hv_hypervisor_version_info *info)
return 0;
}
EXPORT_SYMBOL_GPL(hv_get_hypervisor_version);
static int __init hyperv_init(void)
{
@ -61,6 +62,8 @@ static int __init hyperv_init(void)
ms_hyperv.features, ms_hyperv.priv_high, ms_hyperv.hints,
ms_hyperv.misc_features);
hv_identify_partition_type();
ret = hv_common_init();
if (ret)
return ret;
@ -72,6 +75,9 @@ static int __init hyperv_init(void)
return ret;
}
if (ms_hyperv.priv_high & HV_ACCESS_PARTITION_ID)
hv_get_partition_id();
ms_hyperv_late_init();
hyperv_initialized = true;

View File

@ -40,6 +40,19 @@ static inline u64 hv_get_msr(unsigned int reg)
return hv_get_vpreg(reg);
}
/*
* Nested is not supported on arm64
*/
static inline void hv_set_non_nested_msr(unsigned int reg, u64 value)
{
hv_set_msr(reg, value);
}
static inline u64 hv_get_non_nested_msr(unsigned int reg)
{
return hv_get_msr(reg);
}
/* SMCCC hypercall parameters */
#define HV_SMCCC_FUNC_NUMBER 1
#define HV_FUNC_ID ARM_SMCCC_CALL_VAL( \

View File

@ -1,6 +1,6 @@
# SPDX-License-Identifier: GPL-2.0-only
obj-y := hv_init.o mmu.o nested.o irqdomain.o ivm.o
obj-$(CONFIG_X86_64) += hv_apic.o hv_proc.o
obj-$(CONFIG_X86_64) += hv_apic.o
obj-$(CONFIG_HYPERV_VTL_MODE) += hv_vtl.o
ifdef CONFIG_X86_64

View File

@ -145,6 +145,11 @@ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector,
ipi_arg->vp_set.format = HV_GENERIC_SET_ALL;
}
/*
* For this hypercall, Hyper-V treats the valid_bank_mask field
* of ipi_arg->vp_set as part of the fixed size input header.
* So the variable input header size is equal to nr_bank.
*/
status = hv_do_rep_hypercall(HVCALL_SEND_IPI_EX, 0, nr_bank,
ipi_arg, NULL);

View File

@ -34,9 +34,6 @@
#include <clocksource/hyperv_timer.h>
#include <linux/highmem.h>
u64 hv_current_partition_id = ~0ull;
EXPORT_SYMBOL_GPL(hv_current_partition_id);
void *hv_hypercall_pg;
EXPORT_SYMBOL_GPL(hv_hypercall_pg);
@ -93,7 +90,7 @@ static int hv_cpu_init(unsigned int cpu)
return 0;
hvp = &hv_vp_assist_page[cpu];
if (hv_root_partition) {
if (hv_root_partition()) {
/*
* For root partition we get the hypervisor provided VP assist
* page, instead of allocating a new page.
@ -245,7 +242,7 @@ static int hv_cpu_die(unsigned int cpu)
if (hv_vp_assist_page && hv_vp_assist_page[cpu]) {
union hv_vp_assist_msr_contents msr = { 0 };
if (hv_root_partition) {
if (hv_root_partition()) {
/*
* For root partition the VP assist page is mapped to
* hypervisor provided page, and thus we unmap the
@ -320,7 +317,7 @@ static int hv_suspend(void)
union hv_x64_msr_hypercall_contents hypercall_msr;
int ret;
if (hv_root_partition)
if (hv_root_partition())
return -EPERM;
/*
@ -393,24 +390,6 @@ static void __init hv_stimer_setup_percpu_clockev(void)
old_setup_percpu_clockev();
}
static void __init hv_get_partition_id(void)
{
struct hv_get_partition_id *output_page;
u64 status;
unsigned long flags;
local_irq_save(flags);
output_page = *this_cpu_ptr(hyperv_pcpu_output_arg);
status = hv_do_hypercall(HVCALL_GET_PARTITION_ID, NULL, output_page);
if (!hv_result_success(status)) {
/* No point in proceeding if this failed */
pr_err("Failed to get partition ID: %lld\n", status);
BUG();
}
hv_current_partition_id = output_page->partition_id;
local_irq_restore(flags);
}
#if IS_ENABLED(CONFIG_HYPERV_VTL_MODE)
static u8 __init get_vtl(void)
{
@ -539,7 +518,7 @@ void __init hyperv_init(void)
rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
hypercall_msr.enable = 1;
if (hv_root_partition) {
if (hv_root_partition()) {
struct page *pg;
void *src;
@ -605,17 +584,15 @@ skip_hypercall_pg_init:
register_syscore_ops(&hv_syscore_ops);
if (cpuid_ebx(HYPERV_CPUID_FEATURES) & HV_ACCESS_PARTITION_ID)
if (ms_hyperv.priv_high & HV_ACCESS_PARTITION_ID)
hv_get_partition_id();
BUG_ON(hv_root_partition && hv_current_partition_id == ~0ull);
#ifdef CONFIG_PCI_MSI
/*
* If we're running as root, we want to create our own PCI MSI domain.
* We can't set this in hv_pci_init because that would be too late.
*/
if (hv_root_partition)
if (hv_root_partition())
x86_init.irqs.create_pci_msi_domain = hv_create_pci_msi_domain;
#endif

View File

@ -12,6 +12,7 @@
#include <asm/i8259.h>
#include <asm/mshyperv.h>
#include <asm/realmode.h>
#include <asm/reboot.h>
#include <../kernel/smpboot.h>
extern struct boot_params boot_params;
@ -22,6 +23,36 @@ static bool __init hv_vtl_msi_ext_dest_id(void)
return true;
}
/*
* The `native_machine_emergency_restart` function from `reboot.c` writes
* to the physical address 0x472 to indicate the type of reboot for the
* firmware. We cannot have that in VSM as the memory composition might
* be more generic, and such a write effectively corrupts the memory,
* making diagnostics harder at the very least.
*/
static void __noreturn hv_vtl_emergency_restart(void)
{
/*
* Cause a triple fault and an immediate reset. The code here does not run
* on top of any firmware, so it cannot reach out to firmware services.
* The infinite loop covers the improbable case that the triple fault does
* not work and the state has to be preserved intact for debugging.
*/
for (;;) {
idt_invalidate();
__asm__ __volatile__("int3");
}
}
/*
* The only way to restart in the VTL mode is to triple fault as the kernel runs
* as firmware.
*/
static void __noreturn hv_vtl_restart(char __maybe_unused *cmd)
{
hv_vtl_emergency_restart();
}
void __init hv_vtl_init_platform(void)
{
pr_info("Linux runs in Hyper-V Virtual Trust Level\n");
@ -236,6 +267,9 @@ static int hv_vtl_wakeup_secondary_cpu(u32 apicid, unsigned long start_eip)
int __init hv_vtl_early_init(void)
{
machine_ops.emergency_restart = hv_vtl_emergency_restart;
machine_ops.restart = hv_vtl_restart;
/*
* `boot_cpu_has` returns the runtime feature support,
* and here is the earliest it can be used.

View File

@ -64,7 +64,7 @@ static int hv_map_interrupt(union hv_device_id device_id, bool level,
local_irq_restore(flags);
if (!hv_result_success(status))
pr_err("%s: hypercall failed, status %lld\n", __func__, status);
hv_status_err(status, "\n");
return hv_result(status);
}
@ -224,7 +224,7 @@ static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
kfree(stored_entry);
if (status != HV_STATUS_SUCCESS) {
pr_debug("%s: failed to unmap, status %lld", __func__, status);
hv_status_debug(status, "failed to unmap\n");
return;
}
}
@ -273,7 +273,7 @@ static void hv_teardown_msi_irq(struct pci_dev *dev, struct irq_data *irqd)
status = hv_unmap_msi_interrupt(dev, &old_entry);
if (status != HV_STATUS_SUCCESS)
pr_err("%s: hypercall failed, status %lld\n", __func__, status);
hv_status_err(status, "\n");
}
static void hv_msi_free_irq(struct irq_domain *domain,

View File

@ -338,7 +338,7 @@ int hv_snp_boot_ap(u32 cpu, unsigned long start_ip)
vmsa->sev_features = sev_status >> 2;
ret = snp_set_vmsa(vmsa, true);
if (!ret) {
if (ret) {
pr_err("RMPADJUST(%llx) failed: %llx\n", (u64)vmsa, ret);
free_page((u64)vmsa);
return ret;

View File

@ -205,6 +205,10 @@ static u64 hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
/*
* We can flush not more than max_gvas with one hypercall. Flush the
* whole address space if we were asked to do more.
*
* For these hypercalls, Hyper-V treats the valid_bank_mask field
* of flush->hv_vp_set as part of the fixed size input header.
* So the variable input header size is equal to nr_bank.
*/
max_gvas =
(PAGE_SIZE - sizeof(*flush) - nr_bank *

View File

@ -43,8 +43,6 @@ extern bool hyperv_paravisor_present;
extern void *hv_hypercall_pg;
extern u64 hv_current_partition_id;
extern union hv_ghcb * __percpu *hv_ghcb_pg;
bool hv_isolation_type_snp(void);
@ -58,10 +56,6 @@ u64 hv_tdx_hypercall(u64 control, u64 param1, u64 param2);
#define HV_AP_INIT_GPAT_DEFAULT 0x0007040600070406ULL
#define HV_AP_SEGMENT_LIMIT 0xffffffff
int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages);
int hv_call_add_logical_proc(int node, u32 lp_index, u32 acpi_id);
int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags);
/*
* If the hypercall involves no input or output parameters, the hypervisor
* ignores the corresponding GPA pointer.
@ -160,7 +154,7 @@ static inline u64 _hv_do_fast_hypercall8(u64 control, u64 input1)
: "cc", "edi", "esi");
}
#endif
return hv_status;
return hv_status;
}
static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1)

View File

@ -33,8 +33,6 @@
#include <asm/numa.h>
#include <asm/svm.h>
/* Is Linux running as the root partition? */
bool hv_root_partition;
/* Is Linux running on nested Microsoft Hypervisor */
bool hv_nested;
struct ms_hyperv_info ms_hyperv;
@ -109,6 +107,7 @@ void hv_set_msr(unsigned int reg, u64 value)
}
EXPORT_SYMBOL_GPL(hv_set_msr);
static void (*mshv_handler)(void);
static void (*vmbus_handler)(void);
static void (*hv_stimer0_handler)(void);
static void (*hv_kexec_handler)(void);
@ -119,6 +118,9 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_callback)
struct pt_regs *old_regs = set_irq_regs(regs);
inc_irq_stat(irq_hv_callback_count);
if (mshv_handler)
mshv_handler();
if (vmbus_handler)
vmbus_handler();
@ -128,6 +130,11 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_callback)
set_irq_regs(old_regs);
}
void hv_setup_mshv_handler(void (*handler)(void))
{
mshv_handler = handler;
}
void hv_setup_vmbus_handler(void (*handler)(void))
{
vmbus_handler = handler;
@ -422,6 +429,7 @@ int hv_get_hypervisor_version(union hv_hypervisor_version_info *info)
return 0;
}
EXPORT_SYMBOL_GPL(hv_get_hypervisor_version);
static void __init ms_hyperv_init_platform(void)
{
@ -436,13 +444,15 @@ static void __init ms_hyperv_init_platform(void)
*/
ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES);
ms_hyperv.priv_high = cpuid_ebx(HYPERV_CPUID_FEATURES);
ms_hyperv.ext_features = cpuid_ecx(HYPERV_CPUID_FEATURES);
ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES);
ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
hv_max_functions_eax = cpuid_eax(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS);
pr_info("Hyper-V: privilege flags low 0x%x, high 0x%x, hints 0x%x, misc 0x%x\n",
ms_hyperv.features, ms_hyperv.priv_high, ms_hyperv.hints,
pr_info("Hyper-V: privilege flags low %#x, high %#x, ext %#x, hints %#x, misc %#x\n",
ms_hyperv.features, ms_hyperv.priv_high,
ms_hyperv.ext_features, ms_hyperv.hints,
ms_hyperv.misc_features);
ms_hyperv.max_vp_index = cpuid_eax(HYPERV_CPUID_IMPLEMENT_LIMITS);
@ -451,25 +461,7 @@ static void __init ms_hyperv_init_platform(void)
pr_debug("Hyper-V: max %u virtual processors, %u logical processors\n",
ms_hyperv.max_vp_index, ms_hyperv.max_lp_index);
/*
* Check CPU management privilege.
*
* To mirror what Windows does we should extract CPU management
* features and use the ReservedIdentityBit to detect if Linux is the
* root partition. But that requires negotiating CPU management
* interface (a process to be finalized). For now, use the privilege
* flag as the indicator for running as root.
*
* Hyper-V should never specify running as root and as a Confidential
* VM. But to protect against a compromised/malicious Hyper-V trying
* to exploit root behavior to expose Confidential VM memory, ignore
* the root partition setting if also a Confidential VM.
*/
if ((ms_hyperv.priv_high & HV_CPU_MANAGEMENT) &&
!(ms_hyperv.priv_high & HV_ISOLATION)) {
hv_root_partition = true;
pr_info("Hyper-V: running as root partition\n");
}
hv_identify_partition_type();
if (ms_hyperv.hints & HV_X64_HYPERV_NESTED) {
hv_nested = true;
@ -618,7 +610,7 @@ static void __init ms_hyperv_init_platform(void)
# ifdef CONFIG_SMP
smp_ops.smp_prepare_boot_cpu = hv_smp_prepare_boot_cpu;
if (hv_root_partition ||
if (hv_root_partition() ||
(!ms_hyperv.paravisor_present && hv_isolation_type_snp()))
smp_ops.smp_prepare_cpus = hv_smp_prepare_cpus;
# endif

View File

@ -51,6 +51,7 @@ int node_to_pxm(int node)
return PXM_INVAL;
return node_to_pxm_map[node];
}
EXPORT_SYMBOL_GPL(node_to_pxm);
static void __acpi_map_pxm_to_node(int pxm, int node)
{

View File

@ -582,7 +582,7 @@ static void __init hv_init_tsc_clocksource(void)
* mapped.
*/
tsc_msr.as_uint64 = hv_get_msr(HV_MSR_REFERENCE_TSC);
if (hv_root_partition)
if (hv_root_partition())
tsc_pfn = tsc_msr.pfn;
else
tsc_pfn = HVPFN_DOWN(virt_to_phys(tsc_page));
@ -627,7 +627,7 @@ void __init hv_remap_tsc_clocksource(void)
if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE))
return;
if (!hv_root_partition) {
if (!hv_root_partition()) {
WARN(1, "%s: attempt to remap TSC page in guest partition\n",
__func__);
return;

View File

@ -55,4 +55,21 @@ config HYPERV_BALLOON
help
Select this option to enable Hyper-V Balloon driver.
config MSHV_ROOT
tristate "Microsoft Hyper-V root partition support"
depends on HYPERV && (X86_64 || ARM64)
depends on !HYPERV_VTL_MODE
# The hypervisor interface operates on 4k pages. Enforcing it here
# simplifies many assumptions in the root partition code.
# e.g. When withdrawing memory, the hypervisor gives back 4k pages in
# no particular order, making it impossible to reassemble larger pages
depends on PAGE_SIZE_4KB
select EVENTFD
default n
help
Select this option to enable support for booting and running as root
partition on Microsoft Hyper-V.
If unsure, say N.
endmenu

View File

@ -2,6 +2,7 @@
obj-$(CONFIG_HYPERV) += hv_vmbus.o
obj-$(CONFIG_HYPERV_UTILS) += hv_utils.o
obj-$(CONFIG_HYPERV_BALLOON) += hv_balloon.o
obj-$(CONFIG_MSHV_ROOT) += mshv_root.o
CFLAGS_hv_trace.o = -I$(src)
CFLAGS_hv_balloon.o = -I$(src)
@ -11,6 +12,9 @@ hv_vmbus-y := vmbus_drv.o \
channel_mgmt.o ring_buffer.o hv_trace.o
hv_vmbus-$(CONFIG_HYPERV_TESTING) += hv_debugfs.o
hv_utils-y := hv_util.o hv_kvp.o hv_snapshot.o hv_utils_transport.o
mshv_root-y := mshv_root_main.o mshv_synic.o mshv_eventfd.o mshv_irq.o \
mshv_root_hv_call.o mshv_portid_table.o
# Code that must be built-in
obj-$(subst m,y,$(CONFIG_HYPERV)) += hv_common.o
obj-$(subst m,y,$(CONFIG_MSHV_ROOT)) += hv_proc.o mshv_common.o

View File

@ -144,7 +144,7 @@ int hv_synic_alloc(void)
* Synic message and event pages are allocated by paravisor.
* Skip these pages allocation here.
*/
if (!ms_hyperv.paravisor_present && !hv_root_partition) {
if (!ms_hyperv.paravisor_present && !hv_root_partition()) {
hv_cpu->synic_message_page =
(void *)get_zeroed_page(GFP_ATOMIC);
if (!hv_cpu->synic_message_page) {
@ -272,7 +272,7 @@ void hv_synic_enable_regs(unsigned int cpu)
simp.as_uint64 = hv_get_msr(HV_MSR_SIMP);
simp.simp_enabled = 1;
if (ms_hyperv.paravisor_present || hv_root_partition) {
if (ms_hyperv.paravisor_present || hv_root_partition()) {
/* Mask out vTOM bit. ioremap_cache() maps decrypted */
u64 base = (simp.base_simp_gpa << HV_HYP_PAGE_SHIFT) &
~ms_hyperv.shared_gpa_boundary;
@ -291,7 +291,7 @@ void hv_synic_enable_regs(unsigned int cpu)
siefp.as_uint64 = hv_get_msr(HV_MSR_SIEFP);
siefp.siefp_enabled = 1;
if (ms_hyperv.paravisor_present || hv_root_partition) {
if (ms_hyperv.paravisor_present || hv_root_partition()) {
/* Mask out vTOM bit. ioremap_cache() maps decrypted */
u64 base = (siefp.base_siefp_gpa << HV_HYP_PAGE_SHIFT) &
~ms_hyperv.shared_gpa_boundary;
@ -313,17 +313,7 @@ void hv_synic_enable_regs(unsigned int cpu)
shared_sint.vector = vmbus_interrupt;
shared_sint.masked = false;
/*
* On architectures where Hyper-V doesn't support AEOI (e.g., ARM64),
* it doesn't provide a recommendation flag and AEOI must be disabled.
*/
#ifdef HV_DEPRECATING_AEOI_RECOMMENDED
shared_sint.auto_eoi =
!(ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED);
#else
shared_sint.auto_eoi = 0;
#endif
shared_sint.auto_eoi = hv_recommend_using_aeoi();
hv_set_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT, shared_sint.as_uint64);
/* Enable the global synic bit */
@ -367,7 +357,7 @@ void hv_synic_disable_regs(unsigned int cpu)
* addresses.
*/
simp.simp_enabled = 0;
if (ms_hyperv.paravisor_present || hv_root_partition) {
if (ms_hyperv.paravisor_present || hv_root_partition()) {
iounmap(hv_cpu->synic_message_page);
hv_cpu->synic_message_page = NULL;
} else {
@ -379,7 +369,7 @@ void hv_synic_disable_regs(unsigned int cpu)
siefp.as_uint64 = hv_get_msr(HV_MSR_SIEFP);
siefp.siefp_enabled = 0;
if (ms_hyperv.paravisor_present || hv_root_partition) {
if (ms_hyperv.paravisor_present || hv_root_partition()) {
iounmap(hv_cpu->synic_event_page);
hv_cpu->synic_event_page = NULL;
} else {
@ -433,13 +423,47 @@ retry:
return pending;
}
static int hv_pick_new_cpu(struct vmbus_channel *channel)
{
int ret = -EBUSY;
int start;
int cpu;
lockdep_assert_cpus_held();
lockdep_assert_held(&vmbus_connection.channel_mutex);
/*
* We can't assume that the relevant interrupts will be sent before
* the cpu is offlined on older versions of hyperv.
*/
if (vmbus_proto_version < VERSION_WIN10_V5_3)
return -EBUSY;
start = get_random_u32_below(nr_cpu_ids);
for_each_cpu_wrap(cpu, cpu_online_mask, start) {
if (channel->target_cpu == cpu ||
channel->target_cpu == VMBUS_CONNECT_CPU)
continue;
ret = vmbus_channel_set_cpu(channel, cpu);
if (!ret)
break;
}
if (ret)
ret = vmbus_channel_set_cpu(channel, VMBUS_CONNECT_CPU);
return ret;
}
/*
* hv_synic_cleanup - Cleanup routine for hv_synic_init().
*/
int hv_synic_cleanup(unsigned int cpu)
{
struct vmbus_channel *channel, *sc;
bool channel_found = false;
int ret = 0;
if (vmbus_connection.conn_state != CONNECTED)
goto always_cleanup;
@ -456,38 +480,34 @@ int hv_synic_cleanup(unsigned int cpu)
/*
* Search for channels which are bound to the CPU we're about to
* cleanup. In case we find one and vmbus is still connected, we
* fail; this will effectively prevent CPU offlining.
*
* TODO: Re-bind the channels to different CPUs.
* cleanup.
*/
mutex_lock(&vmbus_connection.channel_mutex);
list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
if (channel->target_cpu == cpu) {
channel_found = true;
break;
ret = hv_pick_new_cpu(channel);
if (ret) {
mutex_unlock(&vmbus_connection.channel_mutex);
return ret;
}
}
list_for_each_entry(sc, &channel->sc_list, sc_list) {
if (sc->target_cpu == cpu) {
channel_found = true;
break;
ret = hv_pick_new_cpu(sc);
if (ret) {
mutex_unlock(&vmbus_connection.channel_mutex);
return ret;
}
}
}
if (channel_found)
break;
}
mutex_unlock(&vmbus_connection.channel_mutex);
if (channel_found)
return -EBUSY;
/*
* channel_found == false means that any channels that were previously
* assigned to the CPU have been reassigned elsewhere with a call of
* vmbus_send_modifychannel(). Scan the event flags page looking for
* bits that are set and waiting with a timeout for vmbus_chan_sched()
* to process such bits. If bits are still set after this operation
* and VMBus is connected, fail the CPU offlining operation.
* Scan the event flags page looking for bits that are set and waiting
* with a timeout for vmbus_chan_sched() to process such bits. If bits
* are still set after this operation and VMBus is connected, fail the
* CPU offlining operation.
*/
if (vmbus_proto_version >= VERSION_WIN10_V4_1 && hv_synic_event_pending())
return -EBUSY;
@ -497,5 +517,5 @@ always_cleanup:
hv_synic_disable_regs(cpu);
return 0;
return ret;
}

View File

@ -31,8 +31,14 @@
#include <hyperv/hvhdk.h>
#include <asm/mshyperv.h>
u64 hv_current_partition_id = HV_PARTITION_ID_SELF;
EXPORT_SYMBOL_GPL(hv_current_partition_id);
enum hv_partition_type hv_curr_partition_type;
EXPORT_SYMBOL_GPL(hv_curr_partition_type);
/*
* hv_root_partition, ms_hyperv and hv_nested are defined here with other
* ms_hyperv and hv_nested are defined here with other
* Hyper-V specific globals so they are shared across all architectures and are
* built only when CONFIG_HYPERV is defined. But on x86,
* ms_hyperv_init_platform() is built even when CONFIG_HYPERV is not
@ -40,9 +46,6 @@
* here, allowing for an overriding definition in the module containing
* ms_hyperv_init_platform().
*/
bool __weak hv_root_partition;
EXPORT_SYMBOL_GPL(hv_root_partition);
bool __weak hv_nested;
EXPORT_SYMBOL_GPL(hv_nested);
@ -65,6 +68,16 @@ static void hv_kmsg_dump_unregister(void);
static struct ctl_table_header *hv_ctl_table_hdr;
/*
* Per-cpu array holding the tail pointer for the SynIC event ring buffer
* for each SINT.
*
* We cannot maintain this in mshv driver because the tail pointer should
* persist even if the mshv driver is unloaded.
*/
u8 * __percpu *hv_synic_eventring_tail;
EXPORT_SYMBOL_GPL(hv_synic_eventring_tail);
/*
* Hyper-V specific initialization and shutdown code that is
* common across all architectures. Called from architecture
@ -87,6 +100,9 @@ void __init hv_common_free(void)
free_percpu(hyperv_pcpu_input_arg);
hyperv_pcpu_input_arg = NULL;
free_percpu(hv_synic_eventring_tail);
hv_synic_eventring_tail = NULL;
}
/*
@ -280,7 +296,26 @@ static void hv_kmsg_dump_register(void)
static inline bool hv_output_page_exists(void)
{
return hv_root_partition || IS_ENABLED(CONFIG_HYPERV_VTL_MODE);
return hv_root_partition() || IS_ENABLED(CONFIG_HYPERV_VTL_MODE);
}
void __init hv_get_partition_id(void)
{
struct hv_output_get_partition_id *output;
unsigned long flags;
u64 status, pt_id;
local_irq_save(flags);
output = *this_cpu_ptr(hyperv_pcpu_input_arg);
status = hv_do_hypercall(HVCALL_GET_PARTITION_ID, NULL, output);
pt_id = output->partition_id;
local_irq_restore(flags);
if (hv_result_success(status))
hv_current_partition_id = pt_id;
else
pr_err("Hyper-V: failed to get partition ID: %#x\n",
hv_result(status));
}
int __init hv_common_init(void)
@ -350,6 +385,11 @@ int __init hv_common_init(void)
BUG_ON(!hyperv_pcpu_output_arg);
}
if (hv_root_partition()) {
hv_synic_eventring_tail = alloc_percpu(u8 *);
BUG_ON(!hv_synic_eventring_tail);
}
hv_vp_index = kmalloc_array(nr_cpu_ids, sizeof(*hv_vp_index),
GFP_KERNEL);
if (!hv_vp_index) {
@ -438,11 +478,12 @@ error:
int hv_common_cpu_init(unsigned int cpu)
{
void **inputarg, **outputarg;
u8 **synic_eventring_tail;
u64 msr_vp_index;
gfp_t flags;
const int pgcount = hv_output_page_exists() ? 2 : 1;
void *mem;
int ret;
int ret = 0;
/* hv_cpu_init() can be called with IRQs disabled from hv_resume() */
flags = irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL;
@ -450,8 +491,8 @@ int hv_common_cpu_init(unsigned int cpu)
inputarg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg);
/*
* hyperv_pcpu_input_arg and hyperv_pcpu_output_arg memory is already
* allocated if this CPU was previously online and then taken offline
* The per-cpu memory is already allocated if this CPU was previously
* online and then taken offline
*/
if (!*inputarg) {
mem = kmalloc(pgcount * HV_HYP_PAGE_SIZE, flags);
@ -498,11 +539,21 @@ int hv_common_cpu_init(unsigned int cpu)
if (msr_vp_index > hv_max_vp_index)
hv_max_vp_index = msr_vp_index;
return 0;
if (hv_root_partition()) {
synic_eventring_tail = (u8 **)this_cpu_ptr(hv_synic_eventring_tail);
*synic_eventring_tail = kcalloc(HV_SYNIC_SINT_COUNT,
sizeof(u8), flags);
/* No need to unwind any of the above on failure here */
if (unlikely(!*synic_eventring_tail))
ret = -ENOMEM;
}
return ret;
}
int hv_common_cpu_die(unsigned int cpu)
{
u8 **synic_eventring_tail;
/*
* The hyperv_pcpu_input_arg and hyperv_pcpu_output_arg memory
* is not freed when the CPU goes offline as the hyperv_pcpu_input_arg
@ -515,6 +566,10 @@ int hv_common_cpu_die(unsigned int cpu)
* originally allocated memory is reused in hv_common_cpu_init().
*/
synic_eventring_tail = this_cpu_ptr(hv_synic_eventring_tail);
kfree(*synic_eventring_tail);
*synic_eventring_tail = NULL;
return 0;
}
@ -572,7 +627,7 @@ EXPORT_SYMBOL_GPL(hv_setup_dma_ops);
bool hv_is_hibernation_supported(void)
{
return !hv_root_partition && acpi_sleep_state_supported(ACPI_STATE_S4);
return !hv_root_partition() && acpi_sleep_state_supported(ACPI_STATE_S4);
}
EXPORT_SYMBOL_GPL(hv_is_hibernation_supported);
@ -625,6 +680,11 @@ void __weak hv_remove_vmbus_handler(void)
}
EXPORT_SYMBOL_GPL(hv_remove_vmbus_handler);
void __weak hv_setup_mshv_handler(void (*handler)(void))
{
}
EXPORT_SYMBOL_GPL(hv_setup_mshv_handler);
void __weak hv_setup_kexec_handler(void (*handler)(void))
{
}
@ -661,3 +721,121 @@ u64 __weak hv_tdx_hypercall(u64 control, u64 param1, u64 param2)
return HV_STATUS_INVALID_PARAMETER;
}
EXPORT_SYMBOL_GPL(hv_tdx_hypercall);
void hv_identify_partition_type(void)
{
/* Assume guest role */
hv_curr_partition_type = HV_PARTITION_TYPE_GUEST;
/*
* Check partition creation and cpu management privileges
*
* Hyper-V should never specify running as root and as a Confidential
* VM. But to protect against a compromised/malicious Hyper-V trying
* to exploit root behavior to expose Confidential VM memory, ignore
* the root partition setting if also a Confidential VM.
*/
if ((ms_hyperv.priv_high & HV_CREATE_PARTITIONS) &&
(ms_hyperv.priv_high & HV_CPU_MANAGEMENT) &&
!(ms_hyperv.priv_high & HV_ISOLATION)) {
pr_info("Hyper-V: running as root partition\n");
if (IS_ENABLED(CONFIG_MSHV_ROOT))
hv_curr_partition_type = HV_PARTITION_TYPE_ROOT;
else
pr_crit("Hyper-V: CONFIG_MSHV_ROOT not enabled!\n");
}
}
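
The many hv_root_partition() call sites elsewhere in this merge presumably
reduce to a test of the partition type set here ("hyperv: Change
hv_root_partition into a function"). A minimal sketch of the assumed accessor;
the real definition lives in the Hyper-V headers and is not shown in these
excerpts.

/* Assumed shape of the accessor used throughout this merge. */
static inline bool hv_root_partition(void)
{
	return hv_curr_partition_type == HV_PARTITION_TYPE_ROOT;
}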
struct hv_status_info {
char *string;
int errno;
u16 code;
};
/*
* Note on the errno mappings:
* A failed hypercall is usually only recoverable (or loggable) near
* the call site where the HV_STATUS_* code is known. So the errno
* it gets converted to is not too useful further up the stack.
* Provide a few mappings that could be useful, and revert to -EIO
* as a fallback.
*/
static const struct hv_status_info hv_status_infos[] = {
#define _STATUS_INFO(status, errno) { #status, (errno), (status) }
_STATUS_INFO(HV_STATUS_SUCCESS, 0),
_STATUS_INFO(HV_STATUS_INVALID_HYPERCALL_CODE, -EINVAL),
_STATUS_INFO(HV_STATUS_INVALID_HYPERCALL_INPUT, -EINVAL),
_STATUS_INFO(HV_STATUS_INVALID_ALIGNMENT, -EIO),
_STATUS_INFO(HV_STATUS_INVALID_PARAMETER, -EINVAL),
_STATUS_INFO(HV_STATUS_ACCESS_DENIED, -EIO),
_STATUS_INFO(HV_STATUS_INVALID_PARTITION_STATE, -EIO),
_STATUS_INFO(HV_STATUS_OPERATION_DENIED, -EIO),
_STATUS_INFO(HV_STATUS_UNKNOWN_PROPERTY, -EIO),
_STATUS_INFO(HV_STATUS_PROPERTY_VALUE_OUT_OF_RANGE, -EIO),
_STATUS_INFO(HV_STATUS_INSUFFICIENT_MEMORY, -ENOMEM),
_STATUS_INFO(HV_STATUS_INVALID_PARTITION_ID, -EINVAL),
_STATUS_INFO(HV_STATUS_INVALID_VP_INDEX, -EINVAL),
_STATUS_INFO(HV_STATUS_NOT_FOUND, -EIO),
_STATUS_INFO(HV_STATUS_INVALID_PORT_ID, -EINVAL),
_STATUS_INFO(HV_STATUS_INVALID_CONNECTION_ID, -EINVAL),
_STATUS_INFO(HV_STATUS_INSUFFICIENT_BUFFERS, -EIO),
_STATUS_INFO(HV_STATUS_NOT_ACKNOWLEDGED, -EIO),
_STATUS_INFO(HV_STATUS_INVALID_VP_STATE, -EIO),
_STATUS_INFO(HV_STATUS_NO_RESOURCES, -EIO),
_STATUS_INFO(HV_STATUS_PROCESSOR_FEATURE_NOT_SUPPORTED, -EIO),
_STATUS_INFO(HV_STATUS_INVALID_LP_INDEX, -EINVAL),
_STATUS_INFO(HV_STATUS_INVALID_REGISTER_VALUE, -EINVAL),
_STATUS_INFO(HV_STATUS_OPERATION_FAILED, -EIO),
_STATUS_INFO(HV_STATUS_TIME_OUT, -EIO),
_STATUS_INFO(HV_STATUS_CALL_PENDING, -EIO),
_STATUS_INFO(HV_STATUS_VTL_ALREADY_ENABLED, -EIO),
#undef _STATUS_INFO
};
static inline const struct hv_status_info *find_hv_status_info(u64 hv_status)
{
int i;
u16 code = hv_result(hv_status);
for (i = 0; i < ARRAY_SIZE(hv_status_infos); ++i) {
const struct hv_status_info *info = &hv_status_infos[i];
if (info->code == code)
return info;
}
return NULL;
}
/* Convert a hypercall result into a linux-friendly error code. */
int hv_result_to_errno(u64 status)
{
const struct hv_status_info *info;
/* hv_do_hypercall() may return U64_MAX, hypercalls aren't possible */
if (unlikely(status == U64_MAX))
return -EOPNOTSUPP;
info = find_hv_status_info(status);
if (info)
return info->errno;
return -EIO;
}
EXPORT_SYMBOL_GPL(hv_result_to_errno);
const char *hv_result_to_string(u64 status)
{
const struct hv_status_info *info;
if (unlikely(status == U64_MAX))
return "Hypercall page missing!";
info = find_hv_status_info(status);
if (info)
return info->string;
return "Unknown";
}
EXPORT_SYMBOL_GPL(hv_result_to_string);
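
hv_result_to_string() is what the new hv_status_err()/hv_status_debug() call
sites in the irqdomain and hv_proc hunks presumably build on ("hyperv: Log
hypercall status codes as strings"). The macros themselves are not shown in
these excerpts; a hedged sketch of their likely shape:

/* Assumed shape only -- the real macros live in the Hyper-V headers. */
#define hv_status_err(status, fmt, ...)					\
	pr_err("%s: hypercall failed: %s: " fmt, __func__,		\
	       hv_result_to_string(status), ##__VA_ARGS__)

#define hv_status_debug(status, fmt, ...)				\
	pr_debug("%s: hypercall failed: %s: " fmt, __func__,		\
		 hv_result_to_string(status), ##__VA_ARGS__)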

View File

@ -6,11 +6,7 @@
#include <linux/slab.h>
#include <linux/cpuhotplug.h>
#include <linux/minmax.h>
#include <asm/hypervisor.h>
#include <asm/mshyperv.h>
#include <asm/apic.h>
#include <asm/trace/hyperv.h>
/*
* See struct hv_deposit_memory. The first u64 is partition ID, the rest
@ -91,8 +87,8 @@ int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages)
page_count, 0, input_page, NULL);
local_irq_restore(flags);
if (!hv_result_success(status)) {
pr_err("Failed to deposit pages: %lld\n", status);
ret = hv_result(status);
hv_status_err(status, "\n");
ret = hv_result_to_errno(status);
goto err_free_allocations;
}
@ -111,6 +107,7 @@ free_buf:
kfree(counts);
return ret;
}
EXPORT_SYMBOL_GPL(hv_call_deposit_pages);
int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id)
{
@ -118,7 +115,7 @@ int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id)
struct hv_output_add_logical_processor *output;
u64 status;
unsigned long flags;
int ret = HV_STATUS_SUCCESS;
int ret = 0;
/*
* When adding a logical processor, the hypervisor may return
@ -141,9 +138,9 @@ int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id)
if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
if (!hv_result_success(status)) {
pr_err("%s: cpu %u apic ID %u, %lld\n", __func__,
lp_index, apic_id, status);
ret = hv_result(status);
hv_status_err(status, "cpu %u apic ID: %u\n",
lp_index, apic_id);
ret = hv_result_to_errno(status);
}
break;
}
@ -158,7 +155,7 @@ int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags)
struct hv_create_vp *input;
u64 status;
unsigned long irq_flags;
int ret = HV_STATUS_SUCCESS;
int ret = 0;
/* Root VPs don't seem to need pages deposited */
if (partition_id != hv_current_partition_id) {
@ -183,9 +180,9 @@ int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags)
if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
if (!hv_result_success(status)) {
pr_err("%s: vcpu %u, lp %u, %lld\n", __func__,
vp_index, flags, status);
ret = hv_result(status);
hv_status_err(status, "vcpu: %u, lp: %u\n",
vp_index, flags);
ret = hv_result_to_errno(status);
}
break;
}
@ -195,4 +192,4 @@ int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags)
return ret;
}
EXPORT_SYMBOL_GPL(hv_call_create_vp);

drivers/hv/mshv.h (new file, 30 lines)
View File

@ -0,0 +1,30 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (c) 2023, Microsoft Corporation.
*/
#ifndef _MSHV_H_
#define _MSHV_H_
#include <linux/stddef.h>
#include <linux/string.h>
#include <hyperv/hvhdk.h>
#define mshv_field_nonzero(STRUCT, MEMBER) \
memchr_inv(&((STRUCT).MEMBER), \
0, sizeof_field(typeof(STRUCT), MEMBER))
int hv_call_get_vp_registers(u32 vp_index, u64 partition_id, u16 count,
union hv_input_vtl input_vtl,
struct hv_register_assoc *registers);
int hv_call_set_vp_registers(u32 vp_index, u64 partition_id, u16 count,
union hv_input_vtl input_vtl,
struct hv_register_assoc *registers);
int hv_call_get_partition_property(u64 partition_id, u64 property_code,
u64 *property_value);
int mshv_do_pre_guest_mode_work(ulong th_flags);
#endif /* _MSHV_H */

drivers/hv/mshv_common.c (new file, 161 lines)
View File

@ -0,0 +1,161 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2024, Microsoft Corporation.
*
* This file contains functions that will be called from one or more modules.
* If any of these modules are configured to build, this file is built and just
* statically linked in.
*
* Authors: Microsoft Linux virtualization team
*/
#include <linux/kernel.h>
#include <linux/mm.h>
#include <asm/mshyperv.h>
#include <linux/resume_user_mode.h>
#include "mshv.h"
#define HV_GET_REGISTER_BATCH_SIZE \
(HV_HYP_PAGE_SIZE / sizeof(union hv_register_value))
#define HV_SET_REGISTER_BATCH_SIZE \
((HV_HYP_PAGE_SIZE - sizeof(struct hv_input_set_vp_registers)) \
/ sizeof(struct hv_register_assoc))
int hv_call_get_vp_registers(u32 vp_index, u64 partition_id, u16 count,
union hv_input_vtl input_vtl,
struct hv_register_assoc *registers)
{
struct hv_input_get_vp_registers *input_page;
union hv_register_value *output_page;
u16 completed = 0;
unsigned long remaining = count;
int rep_count, i;
u64 status = HV_STATUS_SUCCESS;
unsigned long flags;
local_irq_save(flags);
input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
output_page = *this_cpu_ptr(hyperv_pcpu_output_arg);
input_page->partition_id = partition_id;
input_page->vp_index = vp_index;
input_page->input_vtl.as_uint8 = input_vtl.as_uint8;
input_page->rsvd_z8 = 0;
input_page->rsvd_z16 = 0;
while (remaining) {
rep_count = min(remaining, HV_GET_REGISTER_BATCH_SIZE);
for (i = 0; i < rep_count; ++i)
input_page->names[i] = registers[i].name;
status = hv_do_rep_hypercall(HVCALL_GET_VP_REGISTERS, rep_count,
0, input_page, output_page);
if (!hv_result_success(status))
break;
completed = hv_repcomp(status);
for (i = 0; i < completed; ++i)
registers[i].value = output_page[i];
registers += completed;
remaining -= completed;
}
local_irq_restore(flags);
return hv_result_to_errno(status);
}
EXPORT_SYMBOL_GPL(hv_call_get_vp_registers);
int hv_call_set_vp_registers(u32 vp_index, u64 partition_id, u16 count,
union hv_input_vtl input_vtl,
struct hv_register_assoc *registers)
{
struct hv_input_set_vp_registers *input_page;
u16 completed = 0;
unsigned long remaining = count;
int rep_count;
u64 status = HV_STATUS_SUCCESS;
unsigned long flags;
local_irq_save(flags);
input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
input_page->partition_id = partition_id;
input_page->vp_index = vp_index;
input_page->input_vtl.as_uint8 = input_vtl.as_uint8;
input_page->rsvd_z8 = 0;
input_page->rsvd_z16 = 0;
while (remaining) {
rep_count = min(remaining, HV_SET_REGISTER_BATCH_SIZE);
memcpy(input_page->elements, registers,
sizeof(struct hv_register_assoc) * rep_count);
status = hv_do_rep_hypercall(HVCALL_SET_VP_REGISTERS, rep_count,
0, input_page, NULL);
if (!hv_result_success(status))
break;
completed = hv_repcomp(status);
registers += completed;
remaining -= completed;
}
local_irq_restore(flags);
return hv_result_to_errno(status);
}
EXPORT_SYMBOL_GPL(hv_call_set_vp_registers);
int hv_call_get_partition_property(u64 partition_id,
u64 property_code,
u64 *property_value)
{
u64 status;
unsigned long flags;
struct hv_input_get_partition_property *input;
struct hv_output_get_partition_property *output;
local_irq_save(flags);
input = *this_cpu_ptr(hyperv_pcpu_input_arg);
output = *this_cpu_ptr(hyperv_pcpu_output_arg);
memset(input, 0, sizeof(*input));
input->partition_id = partition_id;
input->property_code = property_code;
status = hv_do_hypercall(HVCALL_GET_PARTITION_PROPERTY, input, output);
if (!hv_result_success(status)) {
local_irq_restore(flags);
return hv_result_to_errno(status);
}
*property_value = output->property_value;
local_irq_restore(flags);
return 0;
}
EXPORT_SYMBOL_GPL(hv_call_get_partition_property);
/*
* Handle any pre-processing before going into the guest mode on this cpu, most
* notably call schedule(). Must be invoked with both preemption and
* interrupts enabled.
*
* Returns: 0 on success, -errno on error.
*/
int mshv_do_pre_guest_mode_work(ulong th_flags)
{
if (th_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
return -EINTR;
if (th_flags & _TIF_NEED_RESCHED)
schedule();
if (th_flags & _TIF_NOTIFY_RESUME)
resume_user_mode_work(NULL);
return 0;
}
EXPORT_SYMBOL_GPL(mshv_do_pre_guest_mode_work);

drivers/hv/mshv_eventfd.c (new file, 833 lines)
View File

@ -0,0 +1,833 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* eventfd support for mshv
*
* Heavily inspired from KVM implementation of irqfd/ioeventfd. The basic
* framework code is taken from the kvm implementation.
*
* All credits to kvm developers.
*/
#include <linux/syscalls.h>
#include <linux/wait.h>
#include <linux/poll.h>
#include <linux/file.h>
#include <linux/list.h>
#include <linux/workqueue.h>
#include <linux/eventfd.h>
#if IS_ENABLED(CONFIG_X86_64)
#include <asm/apic.h>
#endif
#include <asm/mshyperv.h>
#include "mshv_eventfd.h"
#include "mshv.h"
#include "mshv_root.h"
static struct workqueue_struct *irqfd_cleanup_wq;
void mshv_register_irq_ack_notifier(struct mshv_partition *partition,
struct mshv_irq_ack_notifier *mian)
{
mutex_lock(&partition->pt_irq_lock);
hlist_add_head_rcu(&mian->link, &partition->irq_ack_notifier_list);
mutex_unlock(&partition->pt_irq_lock);
}
void mshv_unregister_irq_ack_notifier(struct mshv_partition *partition,
struct mshv_irq_ack_notifier *mian)
{
mutex_lock(&partition->pt_irq_lock);
hlist_del_init_rcu(&mian->link);
mutex_unlock(&partition->pt_irq_lock);
synchronize_rcu();
}
bool mshv_notify_acked_gsi(struct mshv_partition *partition, int gsi)
{
struct mshv_irq_ack_notifier *mian;
bool acked = false;
rcu_read_lock();
hlist_for_each_entry_rcu(mian, &partition->irq_ack_notifier_list,
link) {
if (mian->irq_ack_gsi == gsi) {
mian->irq_acked(mian);
acked = true;
}
}
rcu_read_unlock();
return acked;
}
#if IS_ENABLED(CONFIG_ARM64)
static inline bool hv_should_clear_interrupt(enum hv_interrupt_type type)
{
return false;
}
#elif IS_ENABLED(CONFIG_X86_64)
static inline bool hv_should_clear_interrupt(enum hv_interrupt_type type)
{
return type == HV_X64_INTERRUPT_TYPE_EXTINT;
}
#endif
static void mshv_irqfd_resampler_ack(struct mshv_irq_ack_notifier *mian)
{
struct mshv_irqfd_resampler *resampler;
struct mshv_partition *partition;
struct mshv_irqfd *irqfd;
int idx;
resampler = container_of(mian, struct mshv_irqfd_resampler,
rsmplr_notifier);
partition = resampler->rsmplr_partn;
idx = srcu_read_lock(&partition->pt_irq_srcu);
hlist_for_each_entry_rcu(irqfd, &resampler->rsmplr_irqfd_list,
irqfd_resampler_hnode) {
if (hv_should_clear_interrupt(irqfd->irqfd_lapic_irq.lapic_control.interrupt_type))
hv_call_clear_virtual_interrupt(partition->pt_id);
eventfd_signal(irqfd->irqfd_resamplefd);
}
srcu_read_unlock(&partition->pt_irq_srcu, idx);
}
#if IS_ENABLED(CONFIG_X86_64)
static bool
mshv_vp_irq_vector_injected(union hv_vp_register_page_interrupt_vectors iv,
u32 vector)
{
int i;
for (i = 0; i < iv.vector_count; i++) {
if (iv.vector[i] == vector)
return true;
}
return false;
}
static int mshv_vp_irq_try_set_vector(struct mshv_vp *vp, u32 vector)
{
union hv_vp_register_page_interrupt_vectors iv, new_iv;
iv = vp->vp_register_page->interrupt_vectors;
new_iv = iv;
if (mshv_vp_irq_vector_injected(iv, vector))
return 0;
if (iv.vector_count >= HV_VP_REGISTER_PAGE_MAX_VECTOR_COUNT)
return -ENOSPC;
new_iv.vector[new_iv.vector_count++] = vector;
if (cmpxchg(&vp->vp_register_page->interrupt_vectors.as_uint64,
iv.as_uint64, new_iv.as_uint64) != iv.as_uint64)
return -EAGAIN;
return 0;
}
static int mshv_vp_irq_set_vector(struct mshv_vp *vp, u32 vector)
{
int ret;
do {
ret = mshv_vp_irq_try_set_vector(vp, vector);
} while (ret == -EAGAIN && !need_resched());
return ret;
}
/*
* Try to raise irq for guest via shared vector array. hyp does the actual
* inject of the interrupt.
*/
static int mshv_try_assert_irq_fast(struct mshv_irqfd *irqfd)
{
struct mshv_partition *partition = irqfd->irqfd_partn;
struct mshv_lapic_irq *irq = &irqfd->irqfd_lapic_irq;
struct mshv_vp *vp;
if (!(ms_hyperv.ext_features &
HV_VP_DISPATCH_INTERRUPT_INJECTION_AVAILABLE))
return -EOPNOTSUPP;
if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT)
return -EOPNOTSUPP;
if (irq->lapic_control.logical_dest_mode)
return -EOPNOTSUPP;
vp = partition->pt_vp_array[irq->lapic_apic_id];
if (!vp->vp_register_page)
return -EOPNOTSUPP;
if (mshv_vp_irq_set_vector(vp, irq->lapic_vector))
return -EINVAL;
if (vp->run.flags.root_sched_dispatched &&
vp->vp_register_page->interrupt_vectors.as_uint64)
return -EBUSY;
wake_up(&vp->run.vp_suspend_queue);
return 0;
}
#else /* CONFIG_X86_64 */
static int mshv_try_assert_irq_fast(struct mshv_irqfd *irqfd)
{
return -EOPNOTSUPP;
}
#endif
static void mshv_assert_irq_slow(struct mshv_irqfd *irqfd)
{
struct mshv_partition *partition = irqfd->irqfd_partn;
struct mshv_lapic_irq *irq = &irqfd->irqfd_lapic_irq;
unsigned int seq;
int idx;
WARN_ON(irqfd->irqfd_resampler &&
!irq->lapic_control.level_triggered);
idx = srcu_read_lock(&partition->pt_irq_srcu);
if (irqfd->irqfd_girq_ent.guest_irq_num) {
if (!irqfd->irqfd_girq_ent.girq_entry_valid) {
srcu_read_unlock(&partition->pt_irq_srcu, idx);
return;
}
do {
seq = read_seqcount_begin(&irqfd->irqfd_irqe_sc);
} while (read_seqcount_retry(&irqfd->irqfd_irqe_sc, seq));
}
hv_call_assert_virtual_interrupt(irqfd->irqfd_partn->pt_id,
irq->lapic_vector, irq->lapic_apic_id,
irq->lapic_control);
srcu_read_unlock(&partition->pt_irq_srcu, idx);
}
static void mshv_irqfd_resampler_shutdown(struct mshv_irqfd *irqfd)
{
struct mshv_irqfd_resampler *rp = irqfd->irqfd_resampler;
struct mshv_partition *pt = rp->rsmplr_partn;
mutex_lock(&pt->irqfds_resampler_lock);
hlist_del_rcu(&irqfd->irqfd_resampler_hnode);
synchronize_srcu(&pt->pt_irq_srcu);
if (hlist_empty(&rp->rsmplr_irqfd_list)) {
hlist_del(&rp->rsmplr_hnode);
mshv_unregister_irq_ack_notifier(pt, &rp->rsmplr_notifier);
kfree(rp);
}
mutex_unlock(&pt->irqfds_resampler_lock);
}
/*
* Race-free decouple logic (ordering is critical)
*/
static void mshv_irqfd_shutdown(struct work_struct *work)
{
struct mshv_irqfd *irqfd =
container_of(work, struct mshv_irqfd, irqfd_shutdown);
/*
* Synchronize with the wait-queue and unhook ourselves to prevent
* further events.
*/
remove_wait_queue(irqfd->irqfd_wqh, &irqfd->irqfd_wait);
if (irqfd->irqfd_resampler) {
mshv_irqfd_resampler_shutdown(irqfd);
eventfd_ctx_put(irqfd->irqfd_resamplefd);
}
/*
* It is now safe to release the object's resources
*/
eventfd_ctx_put(irqfd->irqfd_eventfd_ctx);
kfree(irqfd);
}
/* assumes partition->pt_irqfds_lock is held */
static bool mshv_irqfd_is_active(struct mshv_irqfd *irqfd)
{
return !hlist_unhashed(&irqfd->irqfd_hnode);
}
/*
* Mark the irqfd as inactive and schedule it for removal
*
* assumes partition->pt_irqfds_lock is held
*/
static void mshv_irqfd_deactivate(struct mshv_irqfd *irqfd)
{
if (!mshv_irqfd_is_active(irqfd))
return;
hlist_del(&irqfd->irqfd_hnode);
queue_work(irqfd_cleanup_wq, &irqfd->irqfd_shutdown);
}
/*
* Called with wqh->lock held and interrupts disabled
*/
static int mshv_irqfd_wakeup(wait_queue_entry_t *wait, unsigned int mode,
int sync, void *key)
{
struct mshv_irqfd *irqfd = container_of(wait, struct mshv_irqfd,
irqfd_wait);
unsigned long flags = (unsigned long)key;
int idx;
unsigned int seq;
struct mshv_partition *pt = irqfd->irqfd_partn;
int ret = 0;
if (flags & POLLIN) {
u64 cnt;
eventfd_ctx_do_read(irqfd->irqfd_eventfd_ctx, &cnt);
idx = srcu_read_lock(&pt->pt_irq_srcu);
do {
seq = read_seqcount_begin(&irqfd->irqfd_irqe_sc);
} while (read_seqcount_retry(&irqfd->irqfd_irqe_sc, seq));
/* An event has been signaled, raise an interrupt */
ret = mshv_try_assert_irq_fast(irqfd);
if (ret)
mshv_assert_irq_slow(irqfd);
srcu_read_unlock(&pt->pt_irq_srcu, idx);
ret = 1;
}
if (flags & POLLHUP) {
/* The eventfd is closing, detach from the partition */
unsigned long flags;
spin_lock_irqsave(&pt->pt_irqfds_lock, flags);
/*
* We must check if someone deactivated the irqfd before
* we could acquire the pt_irqfds_lock since the item is
* deactivated from the mshv side before it is unhooked from
* the wait-queue. If it is already deactivated, we can
* simply return knowing the other side will cleanup for us.
* We cannot race against the irqfd going away since the
* other side is required to acquire wqh->lock, which we hold
*/
if (mshv_irqfd_is_active(irqfd))
mshv_irqfd_deactivate(irqfd);
spin_unlock_irqrestore(&pt->pt_irqfds_lock, flags);
}
return ret;
}
/* Must be called under pt_irqfds_lock */
static void mshv_irqfd_update(struct mshv_partition *pt,
struct mshv_irqfd *irqfd)
{
write_seqcount_begin(&irqfd->irqfd_irqe_sc);
irqfd->irqfd_girq_ent = mshv_ret_girq_entry(pt,
irqfd->irqfd_irqnum);
mshv_copy_girq_info(&irqfd->irqfd_girq_ent, &irqfd->irqfd_lapic_irq);
write_seqcount_end(&irqfd->irqfd_irqe_sc);
}
void mshv_irqfd_routing_update(struct mshv_partition *pt)
{
struct mshv_irqfd *irqfd;
spin_lock_irq(&pt->pt_irqfds_lock);
hlist_for_each_entry(irqfd, &pt->pt_irqfds_list, irqfd_hnode)
mshv_irqfd_update(pt, irqfd);
spin_unlock_irq(&pt->pt_irqfds_lock);
}
static void mshv_irqfd_queue_proc(struct file *file, wait_queue_head_t *wqh,
poll_table *polltbl)
{
struct mshv_irqfd *irqfd =
container_of(polltbl, struct mshv_irqfd, irqfd_polltbl);
irqfd->irqfd_wqh = wqh;
add_wait_queue_priority(wqh, &irqfd->irqfd_wait);
}
static int mshv_irqfd_assign(struct mshv_partition *pt,
struct mshv_user_irqfd *args)
{
struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL;
struct mshv_irqfd *irqfd, *tmp;
unsigned int events;
struct fd f;
int ret;
int idx;
irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL);
if (!irqfd)
return -ENOMEM;
irqfd->irqfd_partn = pt;
irqfd->irqfd_irqnum = args->gsi;
INIT_WORK(&irqfd->irqfd_shutdown, mshv_irqfd_shutdown);
seqcount_spinlock_init(&irqfd->irqfd_irqe_sc, &pt->pt_irqfds_lock);
f = fdget(args->fd);
if (!fd_file(f)) {
ret = -EBADF;
goto out;
}
eventfd = eventfd_ctx_fileget(fd_file(f));
if (IS_ERR(eventfd)) {
ret = PTR_ERR(eventfd);
goto fail;
}
irqfd->irqfd_eventfd_ctx = eventfd;
if (args->flags & BIT(MSHV_IRQFD_BIT_RESAMPLE)) {
struct mshv_irqfd_resampler *rp;
resamplefd = eventfd_ctx_fdget(args->resamplefd);
if (IS_ERR(resamplefd)) {
ret = PTR_ERR(resamplefd);
goto fail;
}
irqfd->irqfd_resamplefd = resamplefd;
mutex_lock(&pt->irqfds_resampler_lock);
hlist_for_each_entry(rp, &pt->irqfds_resampler_list,
rsmplr_hnode) {
if (rp->rsmplr_notifier.irq_ack_gsi ==
irqfd->irqfd_irqnum) {
irqfd->irqfd_resampler = rp;
break;
}
}
if (!irqfd->irqfd_resampler) {
rp = kzalloc(sizeof(*rp), GFP_KERNEL_ACCOUNT);
if (!rp) {
ret = -ENOMEM;
mutex_unlock(&pt->irqfds_resampler_lock);
goto fail;
}
rp->rsmplr_partn = pt;
INIT_HLIST_HEAD(&rp->rsmplr_irqfd_list);
rp->rsmplr_notifier.irq_ack_gsi = irqfd->irqfd_irqnum;
rp->rsmplr_notifier.irq_acked =
mshv_irqfd_resampler_ack;
hlist_add_head(&rp->rsmplr_hnode,
&pt->irqfds_resampler_list);
mshv_register_irq_ack_notifier(pt,
&rp->rsmplr_notifier);
irqfd->irqfd_resampler = rp;
}
hlist_add_head_rcu(&irqfd->irqfd_resampler_hnode,
&irqfd->irqfd_resampler->rsmplr_irqfd_list);
mutex_unlock(&pt->irqfds_resampler_lock);
}
/*
* Install our own custom wake-up handling so we are notified via
* a callback whenever someone signals the underlying eventfd
*/
init_waitqueue_func_entry(&irqfd->irqfd_wait, mshv_irqfd_wakeup);
init_poll_funcptr(&irqfd->irqfd_polltbl, mshv_irqfd_queue_proc);
spin_lock_irq(&pt->pt_irqfds_lock);
if (args->flags & BIT(MSHV_IRQFD_BIT_RESAMPLE) &&
!irqfd->irqfd_lapic_irq.lapic_control.level_triggered) {
/*
* Resample Fd must be for level triggered interrupt
* Otherwise return with failure
*/
spin_unlock_irq(&pt->pt_irqfds_lock);
ret = -EINVAL;
goto fail;
}
ret = 0;
hlist_for_each_entry(tmp, &pt->pt_irqfds_list, irqfd_hnode) {
if (irqfd->irqfd_eventfd_ctx != tmp->irqfd_eventfd_ctx)
continue;
/* This fd is used for another irq already. */
ret = -EBUSY;
spin_unlock_irq(&pt->pt_irqfds_lock);
goto fail;
}
idx = srcu_read_lock(&pt->pt_irq_srcu);
mshv_irqfd_update(pt, irqfd);
hlist_add_head(&irqfd->irqfd_hnode, &pt->pt_irqfds_list);
spin_unlock_irq(&pt->pt_irqfds_lock);
/*
* Check if there was an event already pending on the eventfd
* before we registered, and trigger it as if we didn't miss it.
*/
events = vfs_poll(fd_file(f), &irqfd->irqfd_polltbl);
if (events & POLLIN)
mshv_assert_irq_slow(irqfd);
srcu_read_unlock(&pt->pt_irq_srcu, idx);
/*
* do not drop the file until the irqfd is fully initialized, otherwise
* we might race against the POLLHUP
*/
fdput(f);
return 0;
fail:
if (irqfd->irqfd_resampler)
mshv_irqfd_resampler_shutdown(irqfd);
if (resamplefd && !IS_ERR(resamplefd))
eventfd_ctx_put(resamplefd);
if (eventfd && !IS_ERR(eventfd))
eventfd_ctx_put(eventfd);
fdput(f);
out:
kfree(irqfd);
return ret;
}
/*
* shutdown any irqfd's that match fd+gsi
*/
static int mshv_irqfd_deassign(struct mshv_partition *pt,
struct mshv_user_irqfd *args)
{
struct mshv_irqfd *irqfd;
struct hlist_node *n;
struct eventfd_ctx *eventfd;
eventfd = eventfd_ctx_fdget(args->fd);
if (IS_ERR(eventfd))
return PTR_ERR(eventfd);
hlist_for_each_entry_safe(irqfd, n, &pt->pt_irqfds_list,
irqfd_hnode) {
if (irqfd->irqfd_eventfd_ctx == eventfd &&
irqfd->irqfd_irqnum == args->gsi)
mshv_irqfd_deactivate(irqfd);
}
eventfd_ctx_put(eventfd);
/*
* Block until we know all outstanding shutdown jobs have completed
* so that we guarantee there will not be any more interrupts on this
* gsi once this deassign function returns.
*/
flush_workqueue(irqfd_cleanup_wq);
return 0;
}
int mshv_set_unset_irqfd(struct mshv_partition *pt,
struct mshv_user_irqfd *args)
{
if (args->flags & ~MSHV_IRQFD_FLAGS_MASK)
return -EINVAL;
if (args->flags & BIT(MSHV_IRQFD_BIT_DEASSIGN))
return mshv_irqfd_deassign(pt, args);
return mshv_irqfd_assign(pt, args);
}
/*
* This function is called as the mshv VM fd is being released.
* Shutdown all irqfds that still remain open
*/
static void mshv_irqfd_release(struct mshv_partition *pt)
{
struct mshv_irqfd *irqfd;
struct hlist_node *n;
spin_lock_irq(&pt->pt_irqfds_lock);
hlist_for_each_entry_safe(irqfd, n, &pt->pt_irqfds_list, irqfd_hnode)
mshv_irqfd_deactivate(irqfd);
spin_unlock_irq(&pt->pt_irqfds_lock);
/*
* Block until we know all outstanding shutdown jobs have completed
* since we do not take a mshv_partition* reference.
*/
flush_workqueue(irqfd_cleanup_wq);
}
int mshv_irqfd_wq_init(void)
{
irqfd_cleanup_wq = alloc_workqueue("mshv-irqfd-cleanup", 0, 0);
if (!irqfd_cleanup_wq)
return -ENOMEM;
return 0;
}
void mshv_irqfd_wq_cleanup(void)
{
destroy_workqueue(irqfd_cleanup_wq);
}
/*
* --------------------------------------------------------------------
* ioeventfd: translate a MMIO memory write to an eventfd signal.
*
* userspace can register a MMIO address with an eventfd for receiving
* notification when the memory has been touched.
* --------------------------------------------------------------------
*/
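
(For context, a hedged sketch of the VMM side of this mechanism: userspace
creates an eventfd, registers it together with a guest-physical MMIO range via
an ioctl on the partition fd, and then reads the eventfd to learn that the
guest wrote the doorbell. MSHV_IOEVENTFD and the struct layout below are
assumptions drawn from the fields handled in this file; the real interface is
in uapi/linux/mshv.h.)

#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/mshv.h>		/* uapi; struct/ioctl names assumed here */

static int watch_doorbell(int partition_fd, uint64_t gpa)
{
	int evt = eventfd(0, EFD_CLOEXEC);
	struct mshv_user_ioeventfd args = {
		.fd = evt,
		.addr = gpa,
		.len = 4,	/* trigger on 4-byte writes only */
		.flags = 0,	/* no datamatch: any written value signals */
	};

	if (evt < 0 || ioctl(partition_fd, MSHV_IOEVENTFD, &args) < 0)
		return -1;
	return evt;	/* read() this fd to observe guest doorbell writes */
}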
static void ioeventfd_release(struct mshv_ioeventfd *p, u64 partition_id)
{
if (p->iovntfd_doorbell_id > 0)
mshv_unregister_doorbell(partition_id, p->iovntfd_doorbell_id);
eventfd_ctx_put(p->iovntfd_eventfd);
kfree(p);
}
/* MMIO writes trigger an event if the addr/val match */
static void ioeventfd_mmio_write(int doorbell_id, void *data)
{
struct mshv_partition *partition = (struct mshv_partition *)data;
struct mshv_ioeventfd *p;
rcu_read_lock();
hlist_for_each_entry_rcu(p, &partition->ioeventfds_list, iovntfd_hnode)
if (p->iovntfd_doorbell_id == doorbell_id) {
eventfd_signal(p->iovntfd_eventfd);
break;
}
rcu_read_unlock();
}
static bool ioeventfd_check_collision(struct mshv_partition *pt,
struct mshv_ioeventfd *p)
__must_hold(&pt->mutex)
{
struct mshv_ioeventfd *_p;
hlist_for_each_entry(_p, &pt->ioeventfds_list, iovntfd_hnode)
if (_p->iovntfd_addr == p->iovntfd_addr &&
_p->iovntfd_length == p->iovntfd_length &&
(_p->iovntfd_wildcard || p->iovntfd_wildcard ||
_p->iovntfd_datamatch == p->iovntfd_datamatch))
return true;
return false;
}
static int mshv_assign_ioeventfd(struct mshv_partition *pt,
struct mshv_user_ioeventfd *args)
__must_hold(&pt->mutex)
{
struct mshv_ioeventfd *p;
struct eventfd_ctx *eventfd;
u64 doorbell_flags = 0;
int ret;
/* This mutex is currently protecting ioeventfd.items list */
WARN_ON_ONCE(!mutex_is_locked(&pt->pt_mutex));
if (args->flags & BIT(MSHV_IOEVENTFD_BIT_PIO))
return -EOPNOTSUPP;
/* must be natural-word sized */
switch (args->len) {
case 0:
doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_ANY;
break;
case 1:
doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_BYTE;
break;
case 2:
doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_WORD;
break;
case 4:
doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_DWORD;
break;
case 8:
doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_QWORD;
break;
default:
return -EINVAL;
}
/* check for range overflow */
if (args->addr + args->len < args->addr)
return -EINVAL;
/* check for extra flags that we don't understand */
if (args->flags & ~MSHV_IOEVENTFD_FLAGS_MASK)
return -EINVAL;
eventfd = eventfd_ctx_fdget(args->fd);
if (IS_ERR(eventfd))
return PTR_ERR(eventfd);
p = kzalloc(sizeof(*p), GFP_KERNEL);
if (!p) {
ret = -ENOMEM;
goto fail;
}
p->iovntfd_addr = args->addr;
p->iovntfd_length = args->len;
p->iovntfd_eventfd = eventfd;
/* The datamatch feature is optional, otherwise this is a wildcard */
if (args->flags & BIT(MSHV_IOEVENTFD_BIT_DATAMATCH)) {
p->iovntfd_datamatch = args->datamatch;
} else {
p->iovntfd_wildcard = true;
doorbell_flags |= HV_DOORBELL_FLAG_TRIGGER_ANY_VALUE;
}
if (ioeventfd_check_collision(pt, p)) {
ret = -EEXIST;
goto unlock_fail;
}
ret = mshv_register_doorbell(pt->pt_id, ioeventfd_mmio_write,
(void *)pt, p->iovntfd_addr,
p->iovntfd_datamatch, doorbell_flags);
if (ret < 0)
goto unlock_fail;
p->iovntfd_doorbell_id = ret;
hlist_add_head_rcu(&p->iovntfd_hnode, &pt->ioeventfds_list);
return 0;
unlock_fail:
kfree(p);
fail:
eventfd_ctx_put(eventfd);
return ret;
}
static int mshv_deassign_ioeventfd(struct mshv_partition *pt,
struct mshv_user_ioeventfd *args)
__must_hold(&pt->mutex)
{
struct mshv_ioeventfd *p;
struct eventfd_ctx *eventfd;
struct hlist_node *n;
int ret = -ENOENT;
/* This mutex is currently protecting ioeventfd.items list */
WARN_ON_ONCE(!mutex_is_locked(&pt->pt_mutex));
eventfd = eventfd_ctx_fdget(args->fd);
if (IS_ERR(eventfd))
return PTR_ERR(eventfd);
hlist_for_each_entry_safe(p, n, &pt->ioeventfds_list, iovntfd_hnode) {
bool wildcard = !(args->flags & BIT(MSHV_IOEVENTFD_BIT_DATAMATCH));
if (p->iovntfd_eventfd != eventfd ||
p->iovntfd_addr != args->addr ||
p->iovntfd_length != args->len ||
p->iovntfd_wildcard != wildcard)
continue;
if (!p->iovntfd_wildcard &&
p->iovntfd_datamatch != args->datamatch)
continue;
hlist_del_rcu(&p->iovntfd_hnode);
synchronize_rcu();
ioeventfd_release(p, pt->pt_id);
ret = 0;
break;
}
eventfd_ctx_put(eventfd);
return ret;
}
int mshv_set_unset_ioeventfd(struct mshv_partition *pt,
struct mshv_user_ioeventfd *args)
__must_hold(&pt->mutex)
{
if ((args->flags & ~MSHV_IOEVENTFD_FLAGS_MASK) ||
mshv_field_nonzero(*args, rsvd))
return -EINVAL;
/* PIO not yet implemented */
if (args->flags & BIT(MSHV_IOEVENTFD_BIT_PIO))
return -EOPNOTSUPP;
if (args->flags & BIT(MSHV_IOEVENTFD_BIT_DEASSIGN))
return mshv_deassign_ioeventfd(pt, args);
return mshv_assign_ioeventfd(pt, args);
}
void mshv_eventfd_init(struct mshv_partition *pt)
{
spin_lock_init(&pt->pt_irqfds_lock);
INIT_HLIST_HEAD(&pt->pt_irqfds_list);
INIT_HLIST_HEAD(&pt->irqfds_resampler_list);
mutex_init(&pt->irqfds_resampler_lock);
INIT_HLIST_HEAD(&pt->ioeventfds_list);
}
void mshv_eventfd_release(struct mshv_partition *pt)
{
struct hlist_head items;
struct hlist_node *n;
struct mshv_ioeventfd *p;
hlist_move_list(&pt->ioeventfds_list, &items);
synchronize_rcu();
hlist_for_each_entry_safe(p, n, &items, iovntfd_hnode) {
hlist_del(&p->iovntfd_hnode);
ioeventfd_release(p, pt->pt_id);
}
mshv_irqfd_release(pt);
}

71
drivers/hv/mshv_eventfd.h Normal file
View File

@ -0,0 +1,71 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* irqfd: Allows an fd to be used to inject an interrupt to the guest.
* ioeventfd: Allow an fd to be used to receive a signal from the guest.
* All credit goes to kvm developers.
*/
#ifndef __LINUX_MSHV_EVENTFD_H
#define __LINUX_MSHV_EVENTFD_H
#include <linux/poll.h>
#include "mshv.h"
#include "mshv_root.h"
/*
* struct to contain list of irqfds sharing an irq. Updates are protected
* by partition.irqfds.resampler_lock
*/
struct mshv_irqfd_resampler {
struct mshv_partition *rsmplr_partn;
struct hlist_head rsmplr_irqfd_list;
struct mshv_irq_ack_notifier rsmplr_notifier;
struct hlist_node rsmplr_hnode;
};
struct mshv_irqfd {
struct mshv_partition *irqfd_partn;
struct eventfd_ctx *irqfd_eventfd_ctx;
struct mshv_guest_irq_ent irqfd_girq_ent;
seqcount_spinlock_t irqfd_irqe_sc;
u32 irqfd_irqnum;
struct mshv_lapic_irq irqfd_lapic_irq;
struct hlist_node irqfd_hnode;
poll_table irqfd_polltbl;
wait_queue_head_t *irqfd_wqh;
wait_queue_entry_t irqfd_wait;
struct work_struct irqfd_shutdown;
struct mshv_irqfd_resampler *irqfd_resampler;
struct eventfd_ctx *irqfd_resamplefd;
struct hlist_node irqfd_resampler_hnode;
};
void mshv_eventfd_init(struct mshv_partition *partition);
void mshv_eventfd_release(struct mshv_partition *partition);
void mshv_register_irq_ack_notifier(struct mshv_partition *partition,
struct mshv_irq_ack_notifier *mian);
void mshv_unregister_irq_ack_notifier(struct mshv_partition *partition,
struct mshv_irq_ack_notifier *mian);
bool mshv_notify_acked_gsi(struct mshv_partition *partition, int gsi);
int mshv_set_unset_irqfd(struct mshv_partition *partition,
struct mshv_user_irqfd *args);
int mshv_irqfd_wq_init(void);
void mshv_irqfd_wq_cleanup(void);
struct mshv_ioeventfd {
struct hlist_node iovntfd_hnode;
u64 iovntfd_addr;
int iovntfd_length;
struct eventfd_ctx *iovntfd_eventfd;
u64 iovntfd_datamatch;
int iovntfd_doorbell_id;
bool iovntfd_wildcard;
};
int mshv_set_unset_ioeventfd(struct mshv_partition *pt,
struct mshv_user_ioeventfd *args);
#endif /* __LINUX_MSHV_EVENTFD_H */
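For context, a VMM drives mshv_set_unset_ioeventfd() through the partition file descriptor. A minimal userspace sketch follows; the MSHV_IOEVENTFD ioctl name is an assumption here (check uapi/linux/mshv.h), while the struct and flag names match the code above, and error handling is abbreviated.
/* Hypothetical VMM-side sketch; MSHV_IOEVENTFD is assumed, not verified */
#include <stdint.h>
#include <string.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/mshv.h>
static int wire_doorbell(int partition_fd, uint64_t gpa)
{
	struct mshv_user_ioeventfd args;
	int efd = eventfd(0, EFD_NONBLOCK);
	if (efd < 0)
		return -1;
	memset(&args, 0, sizeof(args));
	args.fd = efd;				/* signalled on guest writes */
	args.addr = gpa;			/* guest physical doorbell address */
	args.len = 4;				/* 32-bit writes only */
	args.datamatch = 0x1;			/* only this value triggers */
	args.flags = 1u << MSHV_IOEVENTFD_BIT_DATAMATCH;
	return ioctl(partition_fd, MSHV_IOEVENTFD, &args);
}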

124
drivers/hv/mshv_irq.c Normal file
View File

@ -0,0 +1,124 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2023, Microsoft Corporation.
*
* Authors: Microsoft Linux virtualization team
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <asm/mshyperv.h>
#include "mshv_eventfd.h"
#include "mshv.h"
#include "mshv_root.h"
/* called from the ioctl code, user wants to update the guest irq table */
int mshv_update_routing_table(struct mshv_partition *partition,
const struct mshv_user_irq_entry *ue,
unsigned int numents)
{
struct mshv_girq_routing_table *new = NULL, *old;
u32 i, nr_rt_entries = 0;
int r = 0;
if (numents == 0)
goto swap_routes;
for (i = 0; i < numents; i++) {
if (ue[i].gsi >= MSHV_MAX_GUEST_IRQS)
return -EINVAL;
if (ue[i].address_hi)
return -EINVAL;
nr_rt_entries = max(nr_rt_entries, ue[i].gsi);
}
nr_rt_entries += 1;
new = kzalloc(struct_size(new, mshv_girq_info_tbl, nr_rt_entries),
GFP_KERNEL_ACCOUNT);
if (!new)
return -ENOMEM;
new->num_rt_entries = nr_rt_entries;
for (i = 0; i < numents; i++) {
struct mshv_guest_irq_ent *girq;
girq = &new->mshv_girq_info_tbl[ue[i].gsi];
/*
* Allow only a one-to-one mapping between GSI and MSI routing.
*/
if (girq->guest_irq_num != 0) {
r = -EINVAL;
goto out;
}
girq->guest_irq_num = ue[i].gsi;
girq->girq_addr_lo = ue[i].address_lo;
girq->girq_addr_hi = ue[i].address_hi;
girq->girq_irq_data = ue[i].data;
girq->girq_entry_valid = true;
}
swap_routes:
mutex_lock(&partition->pt_irq_lock);
old = rcu_dereference_protected(partition->pt_girq_tbl, 1);
rcu_assign_pointer(partition->pt_girq_tbl, new);
mshv_irqfd_routing_update(partition);
mutex_unlock(&partition->pt_irq_lock);
synchronize_srcu_expedited(&partition->pt_irq_srcu);
new = old;
out:
kfree(new);
return r;
}
/* vm is going away, kfree the irq routing table */
void mshv_free_routing_table(struct mshv_partition *partition)
{
struct mshv_girq_routing_table *rt =
rcu_access_pointer(partition->pt_girq_tbl);
kfree(rt);
}
struct mshv_guest_irq_ent
mshv_ret_girq_entry(struct mshv_partition *partition, u32 irqnum)
{
struct mshv_guest_irq_ent entry = { 0 };
struct mshv_girq_routing_table *girq_tbl;
girq_tbl = srcu_dereference_check(partition->pt_girq_tbl,
&partition->pt_irq_srcu,
lockdep_is_held(&partition->pt_irq_lock));
if (!girq_tbl || irqnum >= girq_tbl->num_rt_entries) {
/*
* Premature register_irqfd: girq_entry_valid stays 0,
* so this entry will be ignored anyway
*/
entry.guest_irq_num = irqnum;
return entry;
}
return girq_tbl->mshv_girq_info_tbl[irqnum];
}
void mshv_copy_girq_info(struct mshv_guest_irq_ent *ent,
struct mshv_lapic_irq *lirq)
{
memset(lirq, 0, sizeof(*lirq));
if (!ent || !ent->girq_entry_valid)
return;
lirq->lapic_vector = ent->girq_irq_data & 0xFF;
lirq->lapic_apic_id = (ent->girq_addr_lo >> 12) & 0xFF;
lirq->lapic_control.interrupt_type = (ent->girq_irq_data & 0x700) >> 8;
lirq->lapic_control.level_triggered = (ent->girq_irq_data >> 15) & 0x1;
lirq->lapic_control.logical_dest_mode = (ent->girq_addr_lo >> 2) & 0x1;
}
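A short worked example of the decode above, assuming the conventional x86 MSI address/data encoding:
/*
 * girq_addr_lo = 0xfee01000, girq_irq_data = 0x0031 decodes to:
 *   lapic_vector      = 0x0031 & 0xFF             = 0x31
 *   lapic_apic_id     = (0xfee01000 >> 12) & 0xFF = 0x01
 *   interrupt_type    = (0x0031 & 0x700) >> 8     = 0  (fixed delivery)
 *   level_triggered   = (0x0031 >> 15) & 0x1      = 0  (edge)
 *   logical_dest_mode = (0xfee01000 >> 2) & 0x1   = 0  (physical)
 */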

83
drivers/hv/mshv_portid_table.c Normal file
View File

@ -0,0 +1,83 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <asm/mshyperv.h>
#include "mshv.h"
#include "mshv_root.h"
/*
* Ports and connections are hypervisor constructs used for inter-partition
* communication. A port represents the source and a connection represents
* the destination. Partitions are responsible for managing the port and
* connection ids.
*/
#define PORTID_MIN 1
#define PORTID_MAX INT_MAX
static DEFINE_IDR(port_table_idr);
void
mshv_port_table_fini(void)
{
struct port_table_info *port_info;
unsigned long i, tmp;
idr_lock(&port_table_idr);
if (!idr_is_empty(&port_table_idr)) {
idr_for_each_entry_ul(&port_table_idr, port_info, tmp, i) {
port_info = idr_remove(&port_table_idr, i);
kfree_rcu(port_info, portbl_rcu);
}
}
idr_unlock(&port_table_idr);
}
int
mshv_portid_alloc(struct port_table_info *info)
{
int ret = 0;
idr_lock(&port_table_idr);
ret = idr_alloc(&port_table_idr, info, PORTID_MIN,
PORTID_MAX, GFP_KERNEL);
idr_unlock(&port_table_idr);
return ret;
}
void
mshv_portid_free(int port_id)
{
struct port_table_info *info;
idr_lock(&port_table_idr);
info = idr_remove(&port_table_idr, port_id);
WARN_ON(!info);
idr_unlock(&port_table_idr);
synchronize_rcu();
kfree(info);
}
int
mshv_portid_lookup(int port_id, struct port_table_info *info)
{
struct port_table_info *_info;
int ret = -ENOENT;
rcu_read_lock();
_info = idr_find(&port_table_idr, port_id);
rcu_read_unlock();
if (_info) {
*info = *_info;
ret = 0;
}
return ret;
}
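mshv_portid_lookup() copies the entry out under the RCU read lock, so callers may use the snapshot after the lock is dropped; mshv_portid_free() pairs idr_remove() with synchronize_rcu() before kfree() so a concurrent lookup never reads freed memory. A minimal illustrative caller, a simplified version of what mshv_doorbell_isr() does later in the series:
static void example_dispatch_doorbell(int port_id)
{
	struct port_table_info info;
	if (mshv_portid_lookup(port_id, &info))
		return;		/* no such port */
	if (info.hv_port_type == HV_PORT_TYPE_DOORBELL)
		info.hv_port_doorbell.doorbell_cb(port_id,
						  info.hv_port_doorbell.data);
}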

311
drivers/hv/mshv_root.h Normal file
View File

@ -0,0 +1,311 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (c) 2023, Microsoft Corporation.
*/
#ifndef _MSHV_ROOT_H_
#define _MSHV_ROOT_H_
#include <linux/spinlock.h>
#include <linux/mutex.h>
#include <linux/semaphore.h>
#include <linux/sched.h>
#include <linux/srcu.h>
#include <linux/wait.h>
#include <linux/hashtable.h>
#include <linux/dev_printk.h>
#include <linux/build_bug.h>
#include <uapi/linux/mshv.h>
/*
* Hypervisor must be between these version numbers (inclusive)
* to guarantee compatibility
*/
#define MSHV_HV_MIN_VERSION (27744)
#define MSHV_HV_MAX_VERSION (27751)
static_assert(HV_HYP_PAGE_SIZE == MSHV_HV_PAGE_SIZE);
#define MSHV_MAX_VPS 256
#define MSHV_PARTITIONS_HASH_BITS 9
#define MSHV_PIN_PAGES_BATCH_SIZE (0x10000000ULL / HV_HYP_PAGE_SIZE)
struct mshv_vp {
u32 vp_index;
struct mshv_partition *vp_partition;
struct mutex vp_mutex;
struct hv_vp_register_page *vp_register_page;
struct hv_message *vp_intercept_msg_page;
void *vp_ghcb_page;
struct hv_stats_page *vp_stats_pages[2];
struct {
atomic64_t vp_signaled_count;
struct {
u64 intercept_suspend: 1;
u64 root_sched_blocked: 1; /* root scheduler only */
u64 root_sched_dispatched: 1; /* root scheduler only */
u64 reserved: 61;
} flags;
unsigned int kicked_by_hv;
wait_queue_head_t vp_suspend_queue;
} run;
};
#define vp_fmt(fmt) "p%lluvp%u: " fmt
#define vp_devprintk(level, v, fmt, ...) \
do { \
const struct mshv_vp *__vp = (v); \
const struct mshv_partition *__pt = __vp->vp_partition; \
dev_##level(__pt->pt_module_dev, vp_fmt(fmt), __pt->pt_id, \
__vp->vp_index, ##__VA_ARGS__); \
} while (0)
#define vp_emerg(v, fmt, ...) vp_devprintk(emerg, v, fmt, ##__VA_ARGS__)
#define vp_crit(v, fmt, ...) vp_devprintk(crit, v, fmt, ##__VA_ARGS__)
#define vp_alert(v, fmt, ...) vp_devprintk(alert, v, fmt, ##__VA_ARGS__)
#define vp_err(v, fmt, ...) vp_devprintk(err, v, fmt, ##__VA_ARGS__)
#define vp_warn(v, fmt, ...) vp_devprintk(warn, v, fmt, ##__VA_ARGS__)
#define vp_notice(v, fmt, ...) vp_devprintk(notice, v, fmt, ##__VA_ARGS__)
#define vp_info(v, fmt, ...) vp_devprintk(info, v, fmt, ##__VA_ARGS__)
#define vp_dbg(v, fmt, ...) vp_devprintk(dbg, v, fmt, ##__VA_ARGS__)
struct mshv_mem_region {
struct hlist_node hnode;
u64 nr_pages;
u64 start_gfn;
u64 start_uaddr;
u32 hv_map_flags;
struct {
u64 large_pages: 1; /* 2MiB */
u64 range_pinned: 1;
u64 reserved: 62;
} flags;
struct mshv_partition *partition;
struct page *pages[];
};
struct mshv_irq_ack_notifier {
struct hlist_node link;
unsigned int irq_ack_gsi;
void (*irq_acked)(struct mshv_irq_ack_notifier *mian);
};
struct mshv_partition {
struct device *pt_module_dev;
struct hlist_node pt_hnode;
u64 pt_id;
refcount_t pt_ref_count;
struct mutex pt_mutex;
struct hlist_head pt_mem_regions; // not ordered
u32 pt_vp_count;
struct mshv_vp *pt_vp_array[MSHV_MAX_VPS];
struct mutex pt_irq_lock;
struct srcu_struct pt_irq_srcu;
struct hlist_head irq_ack_notifier_list;
struct hlist_head pt_devices;
/*
* MSHV does not support more than one async hypercall in flight
* for a single partition. Thus, it is okay to define a per-partition
* async hypercall status.
*/
struct completion async_hypercall;
u64 async_hypercall_status;
spinlock_t pt_irqfds_lock;
struct hlist_head pt_irqfds_list;
struct mutex irqfds_resampler_lock;
struct hlist_head irqfds_resampler_list;
struct hlist_head ioeventfds_list;
struct mshv_girq_routing_table __rcu *pt_girq_tbl;
u64 isolation_type;
bool import_completed;
bool pt_initialized;
};
#define pt_fmt(fmt) "p%llu: " fmt
#define pt_devprintk(level, p, fmt, ...) \
do { \
const struct mshv_partition *__pt = (p); \
dev_##level(__pt->pt_module_dev, pt_fmt(fmt), __pt->pt_id, \
##__VA_ARGS__); \
} while (0)
#define pt_emerg(p, fmt, ...) pt_devprintk(emerg, p, fmt, ##__VA_ARGS__)
#define pt_crit(p, fmt, ...) pt_devprintk(crit, p, fmt, ##__VA_ARGS__)
#define pt_alert(p, fmt, ...) pt_devprintk(alert, p, fmt, ##__VA_ARGS__)
#define pt_err(p, fmt, ...) pt_devprintk(err, p, fmt, ##__VA_ARGS__)
#define pt_warn(p, fmt, ...) pt_devprintk(warn, p, fmt, ##__VA_ARGS__)
#define pt_notice(p, fmt, ...) pt_devprintk(notice, p, fmt, ##__VA_ARGS__)
#define pt_info(p, fmt, ...) pt_devprintk(info, p, fmt, ##__VA_ARGS__)
#define pt_dbg(p, fmt, ...) pt_devprintk(dbg, p, fmt, ##__VA_ARGS__)
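/*
 * Example: pt_err(pt, "bad gfn %llu\n", gfn) expands to
 * dev_err(pt->pt_module_dev, "p%llu: bad gfn %llu\n", pt->pt_id, gfn),
 * and vp_dbg(vp, "suspended\n") likewise prefixes "p<id>vp<index>: ".
 */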
struct mshv_lapic_irq {
u32 lapic_vector;
u64 lapic_apic_id;
union hv_interrupt_control lapic_control;
};
#define MSHV_MAX_GUEST_IRQS 4096
/* representation of one guest irq entry, either msi or legacy */
struct mshv_guest_irq_ent {
u32 girq_entry_valid; /* vfio looks at this */
u32 guest_irq_num; /* a unique number for each irq */
u32 girq_addr_lo; /* guest irq msi address info */
u32 girq_addr_hi;
u32 girq_irq_data; /* idt vector in some cases */
};
struct mshv_girq_routing_table {
u32 num_rt_entries;
struct mshv_guest_irq_ent mshv_girq_info_tbl[];
};
struct hv_synic_pages {
struct hv_message_page *synic_message_page;
struct hv_synic_event_flags_page *synic_event_flags_page;
struct hv_synic_event_ring_page *synic_event_ring_page;
};
struct mshv_root {
struct hv_synic_pages __percpu *synic_pages;
spinlock_t pt_ht_lock;
DECLARE_HASHTABLE(pt_htable, MSHV_PARTITIONS_HASH_BITS);
};
/*
* Callback for doorbell events.
* NOTE: This is called in interrupt context. The callback
* must defer any slow or sleeping work to a later context.
*/
typedef void (*doorbell_cb_t) (int doorbell_id, void *);
/*
* port table information
*/
struct port_table_info {
struct rcu_head portbl_rcu;
enum hv_port_type hv_port_type;
union {
struct {
u64 reserved[2];
} hv_port_message;
struct {
u64 reserved[2];
} hv_port_event;
struct {
u64 reserved[2];
} hv_port_monitor;
struct {
doorbell_cb_t doorbell_cb;
void *data;
} hv_port_doorbell;
};
};
int mshv_update_routing_table(struct mshv_partition *partition,
const struct mshv_user_irq_entry *entries,
unsigned int numents);
void mshv_free_routing_table(struct mshv_partition *partition);
struct mshv_guest_irq_ent mshv_ret_girq_entry(struct mshv_partition *partition,
u32 irq_num);
void mshv_copy_girq_info(struct mshv_guest_irq_ent *src_irq,
struct mshv_lapic_irq *dest_irq);
void mshv_irqfd_routing_update(struct mshv_partition *partition);
void mshv_port_table_fini(void);
int mshv_portid_alloc(struct port_table_info *info);
int mshv_portid_lookup(int port_id, struct port_table_info *info);
void mshv_portid_free(int port_id);
int mshv_register_doorbell(u64 partition_id, doorbell_cb_t doorbell_cb,
void *data, u64 gpa, u64 val, u64 flags);
void mshv_unregister_doorbell(u64 partition_id, int doorbell_portid);
void mshv_isr(void);
int mshv_synic_init(unsigned int cpu);
int mshv_synic_cleanup(unsigned int cpu);
static inline bool mshv_partition_encrypted(struct mshv_partition *partition)
{
return partition->isolation_type == HV_PARTITION_ISOLATION_TYPE_SNP;
}
struct mshv_partition *mshv_partition_get(struct mshv_partition *partition);
void mshv_partition_put(struct mshv_partition *partition);
struct mshv_partition *mshv_partition_find(u64 partition_id) __must_hold(RCU);
/* hypercalls */
int hv_call_withdraw_memory(u64 count, int node, u64 partition_id);
int hv_call_create_partition(u64 flags,
struct hv_partition_creation_properties creation_properties,
union hv_partition_isolation_properties isolation_properties,
u64 *partition_id);
int hv_call_initialize_partition(u64 partition_id);
int hv_call_finalize_partition(u64 partition_id);
int hv_call_delete_partition(u64 partition_id);
int hv_call_map_mmio_pages(u64 partition_id, u64 gfn, u64 mmio_spa, u64 numpgs);
int hv_call_map_gpa_pages(u64 partition_id, u64 gpa_target, u64 page_count,
u32 flags, struct page **pages);
int hv_call_unmap_gpa_pages(u64 partition_id, u64 gpa_target, u64 page_count,
u32 flags);
int hv_call_delete_vp(u64 partition_id, u32 vp_index);
int hv_call_assert_virtual_interrupt(u64 partition_id, u32 vector,
u64 dest_addr,
union hv_interrupt_control control);
int hv_call_clear_virtual_interrupt(u64 partition_id);
int hv_call_get_gpa_access_states(u64 partition_id, u32 count, u64 gpa_base_pfn,
union hv_gpa_page_access_state_flags state_flags,
int *written_total,
union hv_gpa_page_access_state *states);
int hv_call_get_vp_state(u32 vp_index, u64 partition_id,
struct hv_vp_state_data state_data,
/* Choose between pages and ret_output */
u64 page_count, struct page **pages,
union hv_output_get_vp_state *ret_output);
int hv_call_set_vp_state(u32 vp_index, u64 partition_id,
/* Choose between pages and bytes */
struct hv_vp_state_data state_data, u64 page_count,
struct page **pages, u32 num_bytes, u8 *bytes);
int hv_call_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
union hv_input_vtl input_vtl,
struct page **state_page);
int hv_call_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
union hv_input_vtl input_vtl);
int hv_call_create_port(u64 port_partition_id, union hv_port_id port_id,
u64 connection_partition_id, struct hv_port_info *port_info,
u8 port_vtl, u8 min_connection_vtl, int node);
int hv_call_delete_port(u64 port_partition_id, union hv_port_id port_id);
int hv_call_connect_port(u64 port_partition_id, union hv_port_id port_id,
u64 connection_partition_id,
union hv_connection_id connection_id,
struct hv_connection_info *connection_info,
u8 connection_vtl, int node);
int hv_call_disconnect_port(u64 connection_partition_id,
union hv_connection_id connection_id);
int hv_call_notify_port_ring_empty(u32 sint_index);
int hv_call_map_stat_page(enum hv_stats_object_type type,
const union hv_stats_object_identity *identity,
void **addr);
int hv_call_unmap_stat_page(enum hv_stats_object_type type,
const union hv_stats_object_identity *identity);
int hv_call_modify_spa_host_access(u64 partition_id, struct page **pages,
u64 page_struct_count, u32 host_access,
u32 flags, u8 acquire);
extern struct mshv_root mshv_root;
extern enum hv_scheduler_type hv_scheduler_type;
extern u8 * __percpu *hv_synic_eventring_tail;
#endif /* _MSHV_ROOT_H_ */

849
drivers/hv/mshv_root_hv_call.c Normal file
View File

@ -0,0 +1,849 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2023, Microsoft Corporation.
*
* Hypercall helper functions used by the mshv_root module.
*
* Authors: Microsoft Linux virtualization team
*/
#include <linux/kernel.h>
#include <linux/mm.h>
#include <asm/mshyperv.h>
#include "mshv_root.h"
/* Determined empirically */
#define HV_INIT_PARTITION_DEPOSIT_PAGES 208
#define HV_MAP_GPA_DEPOSIT_PAGES 256
#define HV_UMAP_GPA_PAGES 512
#define HV_PAGE_COUNT_2M_ALIGNED(pg_count) (!((pg_count) & (0x200 - 1)))
#define HV_WITHDRAW_BATCH_SIZE (HV_HYP_PAGE_SIZE / sizeof(u64))
#define HV_MAP_GPA_BATCH_SIZE \
((HV_HYP_PAGE_SIZE - sizeof(struct hv_input_map_gpa_pages)) \
/ sizeof(u64))
#define HV_GET_VP_STATE_BATCH_SIZE \
((HV_HYP_PAGE_SIZE - sizeof(struct hv_input_get_vp_state)) \
/ sizeof(u64))
#define HV_SET_VP_STATE_BATCH_SIZE \
((HV_HYP_PAGE_SIZE - sizeof(struct hv_input_set_vp_state)) \
/ sizeof(u64))
#define HV_GET_GPA_ACCESS_STATES_BATCH_SIZE \
((HV_HYP_PAGE_SIZE - sizeof(union hv_gpa_page_access_state)) \
/ sizeof(union hv_gpa_page_access_state))
#define HV_MODIFY_SPARSE_SPA_PAGE_HOST_ACCESS_MAX_PAGE_COUNT \
((HV_HYP_PAGE_SIZE - \
sizeof(struct hv_input_modify_sparse_spa_page_host_access)) / \
sizeof(u64))
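/*
 * Sizing example (header sizes here are assumed for illustration): with
 * HV_HYP_PAGE_SIZE = 4096, HV_WITHDRAW_BATCH_SIZE is 4096 / 8 = 512
 * entries, and if struct hv_input_map_gpa_pages were 24 bytes,
 * HV_MAP_GPA_BATCH_SIZE would be (4096 - 24) / 8 = 509 PFNs per
 * rep hypercall.
 */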
int hv_call_withdraw_memory(u64 count, int node, u64 partition_id)
{
struct hv_input_withdraw_memory *input_page;
struct hv_output_withdraw_memory *output_page;
struct page *page;
u16 completed;
unsigned long remaining = count;
u64 status;
int i;
unsigned long flags;
page = alloc_page(GFP_KERNEL);
if (!page)
return -ENOMEM;
output_page = page_address(page);
while (remaining) {
local_irq_save(flags);
input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
memset(input_page, 0, sizeof(*input_page));
input_page->partition_id = partition_id;
status = hv_do_rep_hypercall(HVCALL_WITHDRAW_MEMORY,
min(remaining, HV_WITHDRAW_BATCH_SIZE),
0, input_page, output_page);
local_irq_restore(flags);
completed = hv_repcomp(status);
for (i = 0; i < completed; i++)
__free_page(pfn_to_page(output_page->gpa_page_list[i]));
if (!hv_result_success(status)) {
if (hv_result(status) == HV_STATUS_NO_RESOURCES)
status = HV_STATUS_SUCCESS;
break;
}
remaining -= completed;
}
free_page((unsigned long)output_page);
return hv_result_to_errno(status);
}
int hv_call_create_partition(u64 flags,
struct hv_partition_creation_properties creation_properties,
union hv_partition_isolation_properties isolation_properties,
u64 *partition_id)
{
struct hv_input_create_partition *input;
struct hv_output_create_partition *output;
u64 status;
int ret;
unsigned long irq_flags;
do {
local_irq_save(irq_flags);
input = *this_cpu_ptr(hyperv_pcpu_input_arg);
output = *this_cpu_ptr(hyperv_pcpu_output_arg);
memset(input, 0, sizeof(*input));
input->flags = flags;
input->compatibility_version = HV_COMPATIBILITY_21_H2;
memcpy(&input->partition_creation_properties, &creation_properties,
sizeof(creation_properties));
memcpy(&input->isolation_properties, &isolation_properties,
sizeof(isolation_properties));
status = hv_do_hypercall(HVCALL_CREATE_PARTITION,
input, output);
if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
if (hv_result_success(status))
*partition_id = output->partition_id;
local_irq_restore(irq_flags);
ret = hv_result_to_errno(status);
break;
}
local_irq_restore(irq_flags);
ret = hv_call_deposit_pages(NUMA_NO_NODE,
hv_current_partition_id, 1);
} while (!ret);
return ret;
}
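/*
 * Note the retry idiom used above and in most helpers below: issue the
 * hypercall; on HV_STATUS_INSUFFICIENT_MEMORY, deposit more pages into
 * the relevant partition with hv_call_deposit_pages() and retry;
 * otherwise convert the status with hv_result_to_errno() and return.
 */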
int hv_call_initialize_partition(u64 partition_id)
{
struct hv_input_initialize_partition input;
u64 status;
int ret;
input.partition_id = partition_id;
ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id,
HV_INIT_PARTITION_DEPOSIT_PAGES);
if (ret)
return ret;
do {
status = hv_do_fast_hypercall8(HVCALL_INITIALIZE_PARTITION,
*(u64 *)&input);
if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
ret = hv_result_to_errno(status);
break;
}
ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id, 1);
} while (!ret);
return ret;
}
int hv_call_finalize_partition(u64 partition_id)
{
struct hv_input_finalize_partition input;
u64 status;
input.partition_id = partition_id;
status = hv_do_fast_hypercall8(HVCALL_FINALIZE_PARTITION,
*(u64 *)&input);
return hv_result_to_errno(status);
}
int hv_call_delete_partition(u64 partition_id)
{
struct hv_input_delete_partition input;
u64 status;
input.partition_id = partition_id;
status = hv_do_fast_hypercall8(HVCALL_DELETE_PARTITION, *(u64 *)&input);
return hv_result_to_errno(status);
}
/* Ask the hypervisor to map guest ram pages or the guest mmio space */
static int hv_do_map_gpa_hcall(u64 partition_id, u64 gfn, u64 page_struct_count,
u32 flags, struct page **pages, u64 mmio_spa)
{
struct hv_input_map_gpa_pages *input_page;
u64 status, *pfnlist;
unsigned long irq_flags, large_shift = 0;
int ret = 0, done = 0;
u64 page_count = page_struct_count;
if (page_count == 0 || (pages && mmio_spa))
return -EINVAL;
if (flags & HV_MAP_GPA_LARGE_PAGE) {
if (mmio_spa)
return -EINVAL;
if (!HV_PAGE_COUNT_2M_ALIGNED(page_count))
return -EINVAL;
large_shift = HV_HYP_LARGE_PAGE_SHIFT - HV_HYP_PAGE_SHIFT;
page_count >>= large_shift;
}
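/*
 * Example: with 4KiB hypervisor pages and 2MiB large pages, large_shift
 * is 21 - 12 = 9, so a 2MiB-aligned request of 1024 page structs is
 * issued to the hypervisor as 1024 >> 9 = 2 large-page entries.
 */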
while (done < page_count) {
ulong i, completed, remain = page_count - done;
int rep_count = min(remain, HV_MAP_GPA_BATCH_SIZE);
local_irq_save(irq_flags);
input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
input_page->target_partition_id = partition_id;
input_page->target_gpa_base = gfn + (done << large_shift);
input_page->map_flags = flags;
pfnlist = input_page->source_gpa_page_list;
for (i = 0; i < rep_count; i++)
if (flags & HV_MAP_GPA_NO_ACCESS) {
pfnlist[i] = 0;
} else if (pages) {
u64 index = (done + i) << large_shift;
if (index >= page_struct_count) {
ret = -EINVAL;
break;
}
pfnlist[i] = page_to_pfn(pages[index]);
} else {
pfnlist[i] = mmio_spa + done + i;
}
if (ret)
break;
status = hv_do_rep_hypercall(HVCALL_MAP_GPA_PAGES, rep_count, 0,
input_page, NULL);
local_irq_restore(irq_flags);
completed = hv_repcomp(status);
if (hv_result(status) == HV_STATUS_INSUFFICIENT_MEMORY) {
ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id,
HV_MAP_GPA_DEPOSIT_PAGES);
if (ret)
break;
} else if (!hv_result_success(status)) {
ret = hv_result_to_errno(status);
break;
}
done += completed;
}
if (ret && done) {
u32 unmap_flags = 0;
if (flags & HV_MAP_GPA_LARGE_PAGE)
unmap_flags |= HV_UNMAP_GPA_LARGE_PAGE;
hv_call_unmap_gpa_pages(partition_id, gfn, done, unmap_flags);
}
return ret;
}
/* Ask the hypervisor to map guest ram pages */
int hv_call_map_gpa_pages(u64 partition_id, u64 gpa_target, u64 page_count,
u32 flags, struct page **pages)
{
return hv_do_map_gpa_hcall(partition_id, gpa_target, page_count,
flags, pages, 0);
}
/* Ask the hypervisor to map guest mmio space */
int hv_call_map_mmio_pages(u64 partition_id, u64 gfn, u64 mmio_spa, u64 numpgs)
{
int i;
u32 flags = HV_MAP_GPA_READABLE | HV_MAP_GPA_WRITABLE |
HV_MAP_GPA_NOT_CACHED;
for (i = 0; i < numpgs; i++)
if (page_is_ram(mmio_spa + i))
return -EINVAL;
return hv_do_map_gpa_hcall(partition_id, gfn, numpgs, flags, NULL,
mmio_spa);
}
int hv_call_unmap_gpa_pages(u64 partition_id, u64 gfn, u64 page_count_4k,
u32 flags)
{
struct hv_input_unmap_gpa_pages *input_page;
u64 status, page_count = page_count_4k;
unsigned long irq_flags, large_shift = 0;
int ret = 0, done = 0;
if (page_count == 0)
return -EINVAL;
if (flags & HV_UNMAP_GPA_LARGE_PAGE) {
if (!HV_PAGE_COUNT_2M_ALIGNED(page_count))
return -EINVAL;
large_shift = HV_HYP_LARGE_PAGE_SHIFT - HV_HYP_PAGE_SHIFT;
page_count >>= large_shift;
}
while (done < page_count) {
ulong completed, remain = page_count - done;
int rep_count = min(remain, HV_UMAP_GPA_PAGES);
local_irq_save(irq_flags);
input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
input_page->target_partition_id = partition_id;
input_page->target_gpa_base = gfn + (done << large_shift);
input_page->unmap_flags = flags;
status = hv_do_rep_hypercall(HVCALL_UNMAP_GPA_PAGES, rep_count,
0, input_page, NULL);
local_irq_restore(irq_flags);
completed = hv_repcomp(status);
if (!hv_result_success(status)) {
ret = hv_result_to_errno(status);
break;
}
done += completed;
}
return ret;
}
int hv_call_get_gpa_access_states(u64 partition_id, u32 count, u64 gpa_base_pfn,
union hv_gpa_page_access_state_flags state_flags,
int *written_total,
union hv_gpa_page_access_state *states)
{
struct hv_input_get_gpa_pages_access_state *input_page;
union hv_gpa_page_access_state *output_page;
int completed = 0;
unsigned long remaining = count;
int rep_count, i;
u64 status = 0;
unsigned long flags;
*written_total = 0;
while (remaining) {
local_irq_save(flags);
input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
output_page = *this_cpu_ptr(hyperv_pcpu_output_arg);
input_page->partition_id = partition_id;
input_page->hv_gpa_page_number = gpa_base_pfn + *written_total;
input_page->flags = state_flags;
rep_count = min(remaining, HV_GET_GPA_ACCESS_STATES_BATCH_SIZE);
status = hv_do_rep_hypercall(HVCALL_GET_GPA_PAGES_ACCESS_STATES, rep_count,
0, input_page, output_page);
if (!hv_result_success(status)) {
local_irq_restore(flags);
break;
}
completed = hv_repcomp(status);
for (i = 0; i < completed; ++i)
states[i].as_uint8 = output_page[i].as_uint8;
local_irq_restore(flags);
states += completed;
*written_total += completed;
remaining -= completed;
}
return hv_result_to_errno(status);
}
int hv_call_assert_virtual_interrupt(u64 partition_id, u32 vector,
u64 dest_addr,
union hv_interrupt_control control)
{
struct hv_input_assert_virtual_interrupt *input;
unsigned long flags;
u64 status;
local_irq_save(flags);
input = *this_cpu_ptr(hyperv_pcpu_input_arg);
memset(input, 0, sizeof(*input));
input->partition_id = partition_id;
input->vector = vector;
input->dest_addr = dest_addr;
input->control = control;
status = hv_do_hypercall(HVCALL_ASSERT_VIRTUAL_INTERRUPT, input, NULL);
local_irq_restore(flags);
return hv_result_to_errno(status);
}
int hv_call_delete_vp(u64 partition_id, u32 vp_index)
{
union hv_input_delete_vp input = {};
u64 status;
input.partition_id = partition_id;
input.vp_index = vp_index;
status = hv_do_fast_hypercall16(HVCALL_DELETE_VP,
input.as_uint64[0], input.as_uint64[1]);
return hv_result_to_errno(status);
}
EXPORT_SYMBOL_GPL(hv_call_delete_vp);
int hv_call_get_vp_state(u32 vp_index, u64 partition_id,
struct hv_vp_state_data state_data,
/* Choose between pages and ret_output */
u64 page_count, struct page **pages,
union hv_output_get_vp_state *ret_output)
{
struct hv_input_get_vp_state *input;
union hv_output_get_vp_state *output;
u64 status;
int i;
u64 control;
unsigned long flags;
int ret = 0;
if (page_count > HV_GET_VP_STATE_BATCH_SIZE)
return -EINVAL;
if (!page_count && !ret_output)
return -EINVAL;
do {
local_irq_save(flags);
input = *this_cpu_ptr(hyperv_pcpu_input_arg);
output = *this_cpu_ptr(hyperv_pcpu_output_arg);
memset(input, 0, sizeof(*input));
memset(output, 0, sizeof(*output));
input->partition_id = partition_id;
input->vp_index = vp_index;
input->state_data = state_data;
for (i = 0; i < page_count; i++)
input->output_data_pfns[i] = page_to_pfn(pages[i]);
control = (HVCALL_GET_VP_STATE) |
(page_count << HV_HYPERCALL_VARHEAD_OFFSET);
status = hv_do_hypercall(control, input, output);
if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
if (hv_result_success(status) && ret_output)
memcpy(ret_output, output, sizeof(*output));
local_irq_restore(flags);
ret = hv_result_to_errno(status);
break;
}
local_irq_restore(flags);
ret = hv_call_deposit_pages(NUMA_NO_NODE,
partition_id, 1);
} while (!ret);
return ret;
}
int hv_call_set_vp_state(u32 vp_index, u64 partition_id,
/* Choose between pages and bytes */
struct hv_vp_state_data state_data, u64 page_count,
struct page **pages, u32 num_bytes, u8 *bytes)
{
struct hv_input_set_vp_state *input;
u64 status;
int i;
u64 control;
unsigned long flags;
int ret = 0;
u16 varhead_sz;
if (page_count > HV_SET_VP_STATE_BATCH_SIZE)
return -EINVAL;
if (sizeof(*input) + num_bytes > HV_HYP_PAGE_SIZE)
return -EINVAL;
if (num_bytes)
/* round up to 8 and divide by 8 */
varhead_sz = (num_bytes + 7) >> 3;
else if (page_count)
varhead_sz = page_count;
else
return -EINVAL;
do {
local_irq_save(flags);
input = *this_cpu_ptr(hyperv_pcpu_input_arg);
memset(input, 0, sizeof(*input));
input->partition_id = partition_id;
input->vp_index = vp_index;
input->state_data = state_data;
if (num_bytes) {
memcpy((u8 *)input->data, bytes, num_bytes);
} else {
for (i = 0; i < page_count; i++)
input->data[i].pfns = page_to_pfn(pages[i]);
}
control = (HVCALL_SET_VP_STATE) |
(varhead_sz << HV_HYPERCALL_VARHEAD_OFFSET);
status = hv_do_hypercall(control, input, NULL);
if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
local_irq_restore(flags);
ret = hv_result_to_errno(status);
break;
}
local_irq_restore(flags);
ret = hv_call_deposit_pages(NUMA_NO_NODE,
partition_id, 1);
} while (!ret);
return ret;
}
int hv_call_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
union hv_input_vtl input_vtl,
struct page **state_page)
{
struct hv_input_map_vp_state_page *input;
struct hv_output_map_vp_state_page *output;
u64 status;
int ret;
unsigned long flags;
do {
local_irq_save(flags);
input = *this_cpu_ptr(hyperv_pcpu_input_arg);
output = *this_cpu_ptr(hyperv_pcpu_output_arg);
input->partition_id = partition_id;
input->vp_index = vp_index;
input->type = type;
input->input_vtl = input_vtl;
status = hv_do_hypercall(HVCALL_MAP_VP_STATE_PAGE, input, output);
if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
if (hv_result_success(status))
*state_page = pfn_to_page(output->map_location);
local_irq_restore(flags);
ret = hv_result_to_errno(status);
break;
}
local_irq_restore(flags);
ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id, 1);
} while (!ret);
return ret;
}
int hv_call_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
union hv_input_vtl input_vtl)
{
unsigned long flags;
u64 status;
struct hv_input_unmap_vp_state_page *input;
local_irq_save(flags);
input = *this_cpu_ptr(hyperv_pcpu_input_arg);
memset(input, 0, sizeof(*input));
input->partition_id = partition_id;
input->vp_index = vp_index;
input->type = type;
input->input_vtl = input_vtl;
status = hv_do_hypercall(HVCALL_UNMAP_VP_STATE_PAGE, input, NULL);
local_irq_restore(flags);
return hv_result_to_errno(status);
}
int
hv_call_clear_virtual_interrupt(u64 partition_id)
{
int status;
status = hv_do_fast_hypercall8(HVCALL_CLEAR_VIRTUAL_INTERRUPT,
partition_id);
return hv_result_to_errno(status);
}
int
hv_call_create_port(u64 port_partition_id, union hv_port_id port_id,
u64 connection_partition_id,
struct hv_port_info *port_info,
u8 port_vtl, u8 min_connection_vtl, int node)
{
struct hv_input_create_port *input;
unsigned long flags;
int ret = 0;
int status;
do {
local_irq_save(flags);
input = *this_cpu_ptr(hyperv_pcpu_input_arg);
memset(input, 0, sizeof(*input));
input->port_partition_id = port_partition_id;
input->port_id = port_id;
input->connection_partition_id = connection_partition_id;
input->port_info = *port_info;
input->port_vtl = port_vtl;
input->min_connection_vtl = min_connection_vtl;
input->proximity_domain_info = hv_numa_node_to_pxm_info(node);
status = hv_do_hypercall(HVCALL_CREATE_PORT, input, NULL);
local_irq_restore(flags);
if (hv_result_success(status))
break;
if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
ret = hv_result_to_errno(status);
break;
}
ret = hv_call_deposit_pages(NUMA_NO_NODE, port_partition_id, 1);
} while (!ret);
return ret;
}
int
hv_call_delete_port(u64 port_partition_id, union hv_port_id port_id)
{
union hv_input_delete_port input = { 0 };
int status;
input.port_partition_id = port_partition_id;
input.port_id = port_id;
status = hv_do_fast_hypercall16(HVCALL_DELETE_PORT,
input.as_uint64[0],
input.as_uint64[1]);
return hv_result_to_errno(status);
}
int
hv_call_connect_port(u64 port_partition_id, union hv_port_id port_id,
u64 connection_partition_id,
union hv_connection_id connection_id,
struct hv_connection_info *connection_info,
u8 connection_vtl, int node)
{
struct hv_input_connect_port *input;
unsigned long flags;
int ret = 0, status;
do {
local_irq_save(flags);
input = *this_cpu_ptr(hyperv_pcpu_input_arg);
memset(input, 0, sizeof(*input));
input->port_partition_id = port_partition_id;
input->port_id = port_id;
input->connection_partition_id = connection_partition_id;
input->connection_id = connection_id;
input->connection_info = *connection_info;
input->connection_vtl = connection_vtl;
input->proximity_domain_info = hv_numa_node_to_pxm_info(node);
status = hv_do_hypercall(HVCALL_CONNECT_PORT, input, NULL);
local_irq_restore(flags);
if (hv_result_success(status))
break;
if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
ret = hv_result_to_errno(status);
break;
}
ret = hv_call_deposit_pages(NUMA_NO_NODE,
connection_partition_id, 1);
} while (!ret);
return ret;
}
int
hv_call_disconnect_port(u64 connection_partition_id,
union hv_connection_id connection_id)
{
union hv_input_disconnect_port input = { 0 };
int status;
input.connection_partition_id = connection_partition_id;
input.connection_id = connection_id;
input.is_doorbell = 1;
status = hv_do_fast_hypercall16(HVCALL_DISCONNECT_PORT,
input.as_uint64[0],
input.as_uint64[1]);
return hv_result_to_errno(status);
}
int
hv_call_notify_port_ring_empty(u32 sint_index)
{
union hv_input_notify_port_ring_empty input = { 0 };
int status;
input.sint_index = sint_index;
status = hv_do_fast_hypercall8(HVCALL_NOTIFY_PORT_RING_EMPTY,
input.as_uint64);
return hv_result_to_errno(status);
}
int hv_call_map_stat_page(enum hv_stats_object_type type,
const union hv_stats_object_identity *identity,
void **addr)
{
unsigned long flags;
struct hv_input_map_stats_page *input;
struct hv_output_map_stats_page *output;
u64 status, pfn;
int ret = 0;
do {
local_irq_save(flags);
input = *this_cpu_ptr(hyperv_pcpu_input_arg);
output = *this_cpu_ptr(hyperv_pcpu_output_arg);
memset(input, 0, sizeof(*input));
input->type = type;
input->identity = *identity;
status = hv_do_hypercall(HVCALL_MAP_STATS_PAGE, input, output);
pfn = output->map_location;
local_irq_restore(flags);
if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
ret = hv_result_to_errno(status);
if (hv_result_success(status))
break;
return ret;
}
ret = hv_call_deposit_pages(NUMA_NO_NODE,
hv_current_partition_id, 1);
if (ret)
return ret;
} while (!ret);
*addr = page_address(pfn_to_page(pfn));
return ret;
}
int hv_call_unmap_stat_page(enum hv_stats_object_type type,
const union hv_stats_object_identity *identity)
{
unsigned long flags;
struct hv_input_unmap_stats_page *input;
u64 status;
local_irq_save(flags);
input = *this_cpu_ptr(hyperv_pcpu_input_arg);
memset(input, 0, sizeof(*input));
input->type = type;
input->identity = *identity;
status = hv_do_hypercall(HVCALL_UNMAP_STATS_PAGE, input, NULL);
local_irq_restore(flags);
return hv_result_to_errno(status);
}
int hv_call_modify_spa_host_access(u64 partition_id, struct page **pages,
u64 page_struct_count, u32 host_access,
u32 flags, u8 acquire)
{
struct hv_input_modify_sparse_spa_page_host_access *input_page;
u64 status;
int done = 0;
unsigned long irq_flags, large_shift = 0;
u64 page_count = page_struct_count;
u16 code = acquire ? HVCALL_ACQUIRE_SPARSE_SPA_PAGE_HOST_ACCESS :
HVCALL_RELEASE_SPARSE_SPA_PAGE_HOST_ACCESS;
if (page_count == 0)
return -EINVAL;
if (flags & HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE) {
if (!HV_PAGE_COUNT_2M_ALIGNED(page_count))
return -EINVAL;
large_shift = HV_HYP_LARGE_PAGE_SHIFT - HV_HYP_PAGE_SHIFT;
page_count >>= large_shift;
}
while (done < page_count) {
ulong i, completed, remain = page_count - done;
int rep_count = min(remain,
HV_MODIFY_SPARSE_SPA_PAGE_HOST_ACCESS_MAX_PAGE_COUNT);
local_irq_save(irq_flags);
input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
memset(input_page, 0, sizeof(*input_page));
/*
* Only set the partition id if you are making the pages
* exclusive
*/
if (flags & HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_EXCLUSIVE)
input_page->partition_id = partition_id;
input_page->flags = flags;
input_page->host_access = host_access;
for (i = 0; i < rep_count; i++) {
u64 index = (done + i) << large_shift;
if (index >= page_struct_count)
return -EINVAL;
input_page->spa_page_list[i] =
page_to_pfn(pages[index]);
}
status = hv_do_rep_hypercall(code, rep_count, 0, input_page,
NULL);
local_irq_restore(irq_flags);
completed = hv_repcomp(status);
if (!hv_result_success(status))
return hv_result_to_errno(status);
done += completed;
}
return 0;
}

2307
drivers/hv/mshv_root_main.c Normal file

File diff suppressed because it is too large

665
drivers/hv/mshv_synic.c Normal file
View File

@ -0,0 +1,665 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2023, Microsoft Corporation.
*
* mshv_root module's main interrupt handler and associated functionality.
*
* Authors: Microsoft Linux virtualization team
*/
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/io.h>
#include <linux/random.h>
#include <asm/mshyperv.h>
#include "mshv_eventfd.h"
#include "mshv.h"
static u32 synic_event_ring_get_queued_port(u32 sint_index)
{
struct hv_synic_event_ring_page **event_ring_page;
volatile struct hv_synic_event_ring *ring;
struct hv_synic_pages *spages;
u8 **synic_eventring_tail;
u32 message;
u8 tail;
spages = this_cpu_ptr(mshv_root.synic_pages);
event_ring_page = &spages->synic_event_ring_page;
synic_eventring_tail = (u8 **)this_cpu_ptr(hv_synic_eventring_tail);
if (unlikely(!*synic_eventring_tail)) {
pr_debug("Missing synic event ring tail!\n");
return 0;
}
tail = (*synic_eventring_tail)[sint_index];
if (unlikely(!*event_ring_page)) {
pr_debug("Missing synic event ring page!\n");
return 0;
}
ring = &(*event_ring_page)->sint_event_ring[sint_index];
/*
* Get the message.
*/
message = ring->data[tail];
if (!message) {
if (ring->ring_full) {
/*
* The ring is marked full, but we have consumed all the
* messages. Notify the hypervisor that the ring is now
* empty and check again.
*/
ring->ring_full = 0;
hv_call_notify_port_ring_empty(sint_index);
message = ring->data[tail];
}
if (!message) {
ring->signal_masked = 0;
/*
* Unmask the signal and sync with hypervisor
* before one last check for any message.
*/
mb();
message = ring->data[tail];
/*
* OK, let's bail out.
*/
if (!message)
return 0;
}
ring->signal_masked = 1;
}
/*
* Clear the message in the ring buffer.
*/
ring->data[tail] = 0;
if (++tail == HV_SYNIC_EVENT_RING_MESSAGE_COUNT)
tail = 0;
(*synic_eventring_tail)[sint_index] = tail;
return message;
}
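/*
 * Note on the unmask/re-check sequence above: the signal is unmasked only
 * when the tail slot looks empty, then re-read after the barrier; if a
 * message raced in, signal_masked is set again and the message is consumed
 * on this pass, otherwise the function returns with the signal unmasked so
 * the next event raises an interrupt.
 */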
static bool
mshv_doorbell_isr(struct hv_message *msg)
{
struct hv_notification_message_payload *notification;
u32 port;
if (msg->header.message_type != HVMSG_SYNIC_SINT_INTERCEPT)
return false;
notification = (struct hv_notification_message_payload *)msg->u.payload;
if (notification->sint_index != HV_SYNIC_DOORBELL_SINT_INDEX)
return false;
while ((port = synic_event_ring_get_queued_port(HV_SYNIC_DOORBELL_SINT_INDEX))) {
struct port_table_info ptinfo = { 0 };
if (mshv_portid_lookup(port, &ptinfo)) {
pr_debug("Failed to get port info from port_table!\n");
continue;
}
if (ptinfo.hv_port_type != HV_PORT_TYPE_DOORBELL) {
pr_debug("Not a doorbell port!, port: %d, port_type: %d\n",
port, ptinfo.hv_port_type);
continue;
}
/* Invoke the callback */
ptinfo.hv_port_doorbell.doorbell_cb(port,
ptinfo.hv_port_doorbell.data);
}
return true;
}
static bool mshv_async_call_completion_isr(struct hv_message *msg)
{
bool handled = false;
struct hv_async_completion_message_payload *async_msg;
struct mshv_partition *partition;
u64 partition_id;
if (msg->header.message_type != HVMSG_ASYNC_CALL_COMPLETION)
goto out;
async_msg =
(struct hv_async_completion_message_payload *)msg->u.payload;
partition_id = async_msg->partition_id;
/*
* Hold the RCU read lock for the rest of the isr, because the
* partition could be released at any time.
* e.g. the MSHV_RUN_VP thread could wake on another cpu; it could
* release the partition unless we hold this!
*/
rcu_read_lock();
partition = mshv_partition_find(partition_id);
if (unlikely(!partition)) {
pr_debug("failed to find partition %llu\n", partition_id);
goto unlock_out;
}
partition->async_hypercall_status = async_msg->status;
complete(&partition->async_hypercall);
handled = true;
unlock_out:
rcu_read_unlock();
out:
return handled;
}
static void kick_vp(struct mshv_vp *vp)
{
atomic64_inc(&vp->run.vp_signaled_count);
vp->run.kicked_by_hv = 1;
wake_up(&vp->run.vp_suspend_queue);
}
static void
handle_bitset_message(const struct hv_vp_signal_bitset_scheduler_message *msg)
{
int bank_idx, vps_signaled = 0, bank_mask_size;
struct mshv_partition *partition;
const struct hv_vpset *vpset;
const u64 *bank_contents;
u64 partition_id = msg->partition_id;
if (msg->vp_bitset.bitset.format != HV_GENERIC_SET_SPARSE_4K) {
pr_debug("scheduler message format is not HV_GENERIC_SET_SPARSE_4K");
return;
}
if (msg->vp_count == 0) {
pr_debug("scheduler message with no VP specified");
return;
}
rcu_read_lock();
partition = mshv_partition_find(partition_id);
if (unlikely(!partition)) {
pr_debug("failed to find partition %llu\n", partition_id);
goto unlock_out;
}
vpset = &msg->vp_bitset.bitset;
bank_idx = -1;
bank_contents = vpset->bank_contents;
bank_mask_size = sizeof(vpset->valid_bank_mask) * BITS_PER_BYTE;
while (true) {
int vp_bank_idx = -1;
int vp_bank_size = sizeof(*bank_contents) * BITS_PER_BYTE;
int vp_index;
bank_idx = find_next_bit((unsigned long *)&vpset->valid_bank_mask,
bank_mask_size, bank_idx + 1);
if (bank_idx == bank_mask_size)
break;
while (true) {
struct mshv_vp *vp;
vp_bank_idx = find_next_bit((unsigned long *)bank_contents,
vp_bank_size, vp_bank_idx + 1);
if (vp_bank_idx == vp_bank_size)
break;
vp_index = (bank_idx * vp_bank_size) + vp_bank_idx;
/* This shouldn't happen, but just in case. */
if (unlikely(vp_index >= MSHV_MAX_VPS)) {
pr_debug("VP index %u out of bounds\n",
vp_index);
goto unlock_out;
}
vp = partition->pt_vp_array[vp_index];
if (unlikely(!vp)) {
pr_debug("failed to find VP %u\n", vp_index);
goto unlock_out;
}
kick_vp(vp);
vps_signaled++;
}
bank_contents++;
}
unlock_out:
rcu_read_unlock();
if (vps_signaled != msg->vp_count)
pr_debug("asked to signal %u VPs but only did %u\n",
msg->vp_count, vps_signaled);
}
static void
handle_pair_message(const struct hv_vp_signal_pair_scheduler_message *msg)
{
struct mshv_partition *partition = NULL;
struct mshv_vp *vp;
int idx;
rcu_read_lock();
for (idx = 0; idx < msg->vp_count; idx++) {
u64 partition_id = msg->partition_ids[idx];
u32 vp_index = msg->vp_indexes[idx];
if (idx == 0 || partition->pt_id != partition_id) {
partition = mshv_partition_find(partition_id);
if (unlikely(!partition)) {
pr_debug("failed to find partition %llu\n",
partition_id);
break;
}
}
/* This shouldn't happen, but just in case. */
if (unlikely(vp_index >= MSHV_MAX_VPS)) {
pr_debug("VP index %u out of bounds\n", vp_index);
break;
}
vp = partition->pt_vp_array[vp_index];
if (!vp) {
pr_debug("failed to find VP %u\n", vp_index);
break;
}
kick_vp(vp);
}
rcu_read_unlock();
}
static bool
mshv_scheduler_isr(struct hv_message *msg)
{
if (msg->header.message_type != HVMSG_SCHEDULER_VP_SIGNAL_BITSET &&
msg->header.message_type != HVMSG_SCHEDULER_VP_SIGNAL_PAIR)
return false;
if (msg->header.message_type == HVMSG_SCHEDULER_VP_SIGNAL_BITSET)
handle_bitset_message((struct hv_vp_signal_bitset_scheduler_message *)
msg->u.payload);
else
handle_pair_message((struct hv_vp_signal_pair_scheduler_message *)
msg->u.payload);
return true;
}
static bool
mshv_intercept_isr(struct hv_message *msg)
{
struct mshv_partition *partition;
bool handled = false;
struct mshv_vp *vp;
u64 partition_id;
u32 vp_index;
partition_id = msg->header.sender;
rcu_read_lock();
partition = mshv_partition_find(partition_id);
if (unlikely(!partition)) {
pr_debug("failed to find partition %llu\n",
partition_id);
goto unlock_out;
}
if (msg->header.message_type == HVMSG_X64_APIC_EOI) {
/*
* Check if this gsi is registered in the
* ack_notifier list and invoke the callback
* if registered.
*/
/*
* If there is a notifier, the ack callback is supposed
* to handle the VMEXIT. So we need not pass this message
* to vcpu thread.
*/
struct hv_x64_apic_eoi_message *eoi_msg =
(struct hv_x64_apic_eoi_message *)&msg->u.payload[0];
if (mshv_notify_acked_gsi(partition, eoi_msg->interrupt_vector)) {
handled = true;
goto unlock_out;
}
}
/*
* We should get an opaque intercept message here for all intercept
* messages, since we're using the mapped VP intercept message page.
*
* The intercept message will have been placed in the intercept
* message page at this point.
*
* Make sure the message type matches our expectation.
*/
if (msg->header.message_type != HVMSG_OPAQUE_INTERCEPT) {
pr_debug("wrong message type %d", msg->header.message_type);
goto unlock_out;
}
/*
* Since we directly index the vp, and it has to exist for us to be here
* (because the vp is only deleted when the partition is), no additional
* locking is needed here
*/
vp_index =
((struct hv_opaque_intercept_message *)msg->u.payload)->vp_index;
vp = partition->pt_vp_array[vp_index];
if (unlikely(!vp)) {
pr_debug("failed to find VP %u\n", vp_index);
goto unlock_out;
}
kick_vp(vp);
handled = true;
unlock_out:
rcu_read_unlock();
return handled;
}
void mshv_isr(void)
{
struct hv_synic_pages *spages = this_cpu_ptr(mshv_root.synic_pages);
struct hv_message_page **msg_page = &spages->synic_message_page;
struct hv_message *msg;
bool handled;
if (unlikely(!(*msg_page))) {
pr_debug("Missing synic page!\n");
return;
}
msg = &((*msg_page)->sint_message[HV_SYNIC_INTERCEPTION_SINT_INDEX]);
/*
* If the type isn't set, there isn't really a message;
* it may be some other hyperv interrupt
*/
if (msg->header.message_type == HVMSG_NONE)
return;
handled = mshv_doorbell_isr(msg);
if (!handled)
handled = mshv_scheduler_isr(msg);
if (!handled)
handled = mshv_async_call_completion_isr(msg);
if (!handled)
handled = mshv_intercept_isr(msg);
if (handled) {
/*
* Acknowledge message with hypervisor if another message is
* pending.
*/
msg->header.message_type = HVMSG_NONE;
/*
* Ensure the write is complete so the hypervisor will deliver
* the next message if available.
*/
mb();
if (msg->header.message_flags.msg_pending)
hv_set_non_nested_msr(HV_MSR_EOM, 0);
#ifdef HYPERVISOR_CALLBACK_VECTOR
add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR);
#endif
} else {
pr_warn_once("%s: unknown message type 0x%x\n", __func__,
msg->header.message_type);
}
}
int mshv_synic_init(unsigned int cpu)
{
union hv_synic_simp simp;
union hv_synic_siefp siefp;
union hv_synic_sirbp sirbp;
#ifdef HYPERVISOR_CALLBACK_VECTOR
union hv_synic_sint sint;
#endif
union hv_synic_scontrol sctrl;
struct hv_synic_pages *spages = this_cpu_ptr(mshv_root.synic_pages);
struct hv_message_page **msg_page = &spages->synic_message_page;
struct hv_synic_event_flags_page **event_flags_page =
&spages->synic_event_flags_page;
struct hv_synic_event_ring_page **event_ring_page =
&spages->synic_event_ring_page;
/* Setup the Synic's message page */
simp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIMP);
simp.simp_enabled = true;
*msg_page = memremap(simp.base_simp_gpa << HV_HYP_PAGE_SHIFT,
HV_HYP_PAGE_SIZE,
MEMREMAP_WB);
if (!(*msg_page))
return -EFAULT;
hv_set_non_nested_msr(HV_MSR_SIMP, simp.as_uint64);
/* Setup the Synic's event flags page */
siefp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIEFP);
siefp.siefp_enabled = true;
*event_flags_page = memremap(siefp.base_siefp_gpa << PAGE_SHIFT,
PAGE_SIZE, MEMREMAP_WB);
if (!(*event_flags_page))
goto cleanup;
hv_set_non_nested_msr(HV_MSR_SIEFP, siefp.as_uint64);
/* Setup the Synic's event ring page */
sirbp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIRBP);
sirbp.sirbp_enabled = true;
*event_ring_page = memremap(sirbp.base_sirbp_gpa << PAGE_SHIFT,
PAGE_SIZE, MEMREMAP_WB);
if (!(*event_ring_page))
goto cleanup;
hv_set_non_nested_msr(HV_MSR_SIRBP, sirbp.as_uint64);
#ifdef HYPERVISOR_CALLBACK_VECTOR
/* Enable intercepts */
sint.as_uint64 = 0;
sint.vector = HYPERVISOR_CALLBACK_VECTOR;
sint.masked = false;
sint.auto_eoi = hv_recommend_using_aeoi();
hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_INTERCEPTION_SINT_INDEX,
sint.as_uint64);
/* Doorbell SINT */
sint.as_uint64 = 0;
sint.vector = HYPERVISOR_CALLBACK_VECTOR;
sint.masked = false;
sint.as_intercept = 1;
sint.auto_eoi = hv_recommend_using_aeoi();
hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_DOORBELL_SINT_INDEX,
sint.as_uint64);
#endif
/* Enable global synic bit */
sctrl.as_uint64 = hv_get_non_nested_msr(HV_MSR_SCONTROL);
sctrl.enable = 1;
hv_set_non_nested_msr(HV_MSR_SCONTROL, sctrl.as_uint64);
return 0;
cleanup:
if (*event_ring_page) {
sirbp.sirbp_enabled = false;
hv_set_non_nested_msr(HV_MSR_SIRBP, sirbp.as_uint64);
memunmap(*event_ring_page);
}
if (*event_flags_page) {
siefp.siefp_enabled = false;
hv_set_non_nested_msr(HV_MSR_SIEFP, siefp.as_uint64);
memunmap(*event_flags_page);
}
if (*msg_page) {
simp.simp_enabled = false;
hv_set_non_nested_msr(HV_MSR_SIMP, simp.as_uint64);
memunmap(*msg_page);
}
return -EFAULT;
}
int mshv_synic_cleanup(unsigned int cpu)
{
union hv_synic_sint sint;
union hv_synic_simp simp;
union hv_synic_siefp siefp;
union hv_synic_sirbp sirbp;
union hv_synic_scontrol sctrl;
struct hv_synic_pages *spages = this_cpu_ptr(mshv_root.synic_pages);
struct hv_message_page **msg_page = &spages->synic_message_page;
struct hv_synic_event_flags_page **event_flags_page =
&spages->synic_event_flags_page;
struct hv_synic_event_ring_page **event_ring_page =
&spages->synic_event_ring_page;
/* Disable the interrupt */
sint.as_uint64 = hv_get_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_INTERCEPTION_SINT_INDEX);
sint.masked = true;
hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_INTERCEPTION_SINT_INDEX,
sint.as_uint64);
/* Disable Doorbell SINT */
sint.as_uint64 = hv_get_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_DOORBELL_SINT_INDEX);
sint.masked = true;
hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_DOORBELL_SINT_INDEX,
sint.as_uint64);
/* Disable Synic's event ring page */
sirbp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIRBP);
sirbp.sirbp_enabled = false;
hv_set_non_nested_msr(HV_MSR_SIRBP, sirbp.as_uint64);
memunmap(*event_ring_page);
/* Disable Synic's event flags page */
siefp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIEFP);
siefp.siefp_enabled = false;
hv_set_non_nested_msr(HV_MSR_SIEFP, siefp.as_uint64);
memunmap(*event_flags_page);
/* Disable Synic's message page */
simp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIMP);
simp.simp_enabled = false;
hv_set_non_nested_msr(HV_MSR_SIMP, simp.as_uint64);
memunmap(*msg_page);
/* Disable global synic bit */
sctrl.as_uint64 = hv_get_non_nested_msr(HV_MSR_SCONTROL);
sctrl.enable = 0;
hv_set_non_nested_msr(HV_MSR_SCONTROL, sctrl.as_uint64);
return 0;
}
int
mshv_register_doorbell(u64 partition_id, doorbell_cb_t doorbell_cb, void *data,
u64 gpa, u64 val, u64 flags)
{
struct hv_connection_info connection_info = { 0 };
union hv_connection_id connection_id = { 0 };
struct port_table_info *port_table_info;
struct hv_port_info port_info = { 0 };
union hv_port_id port_id = { 0 };
int ret;
port_table_info = kmalloc(sizeof(*port_table_info), GFP_KERNEL);
if (!port_table_info)
return -ENOMEM;
port_table_info->hv_port_type = HV_PORT_TYPE_DOORBELL;
port_table_info->hv_port_doorbell.doorbell_cb = doorbell_cb;
port_table_info->hv_port_doorbell.data = data;
ret = mshv_portid_alloc(port_table_info);
if (ret < 0) {
kfree(port_table_info);
return ret;
}
port_id.u.id = ret;
port_info.port_type = HV_PORT_TYPE_DOORBELL;
port_info.doorbell_port_info.target_sint = HV_SYNIC_DOORBELL_SINT_INDEX;
port_info.doorbell_port_info.target_vp = HV_ANY_VP;
ret = hv_call_create_port(hv_current_partition_id, port_id, partition_id,
&port_info,
0, 0, NUMA_NO_NODE);
if (ret < 0) {
mshv_portid_free(port_id.u.id);
return ret;
}
connection_id.u.id = port_id.u.id;
connection_info.port_type = HV_PORT_TYPE_DOORBELL;
connection_info.doorbell_connection_info.gpa = gpa;
connection_info.doorbell_connection_info.trigger_value = val;
connection_info.doorbell_connection_info.flags = flags;
ret = hv_call_connect_port(hv_current_partition_id, port_id, partition_id,
connection_id, &connection_info, 0, NUMA_NO_NODE);
if (ret < 0) {
hv_call_delete_port(hv_current_partition_id, port_id);
mshv_portid_free(port_id.u.id);
return ret;
}
// let's use the port_id as the doorbell_id
return port_id.u.id;
}
void
mshv_unregister_doorbell(u64 partition_id, int doorbell_portid)
{
union hv_port_id port_id = { 0 };
union hv_connection_id connection_id = { 0 };
connection_id.u.id = doorbell_portid;
hv_call_disconnect_port(partition_id, connection_id);
port_id.u.id = doorbell_portid;
hv_call_delete_port(hv_current_partition_id, port_id);
mshv_portid_free(doorbell_portid);
}
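Putting the pieces together, the doorbell machinery in this file connects back to the ioeventfd code earlier in the series; the end-to-end flow is roughly:
/*
 * mshv_assign_ioeventfd()
 *   -> mshv_register_doorbell(pt_id, ioeventfd_mmio_write, ...)
 *      -> mshv_portid_alloc() + hv_call_create_port() + hv_call_connect_port()
 * guest writes the registered GPA
 *   -> hypervisor queues the port id on the doorbell SINT event ring
 *   -> mshv_isr() -> mshv_doorbell_isr()
 *      -> mshv_portid_lookup() -> ioeventfd_mmio_write() -> eventfd_signal()
 */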

View File

@ -1611,18 +1611,18 @@ static ssize_t target_cpu_show(struct vmbus_channel *channel, char *buf)
{
return sprintf(buf, "%u\n", channel->target_cpu);
}
static ssize_t target_cpu_store(struct vmbus_channel *channel,
const char *buf, size_t count)
int vmbus_channel_set_cpu(struct vmbus_channel *channel, u32 target_cpu)
{
u32 target_cpu, origin_cpu;
ssize_t ret = count;
u32 origin_cpu;
int ret = 0;
lockdep_assert_cpus_held();
lockdep_assert_held(&vmbus_connection.channel_mutex);
if (vmbus_proto_version < VERSION_WIN10_V4_1)
return -EIO;
if (sscanf(buf, "%uu", &target_cpu) != 1)
return -EIO;
/* Validate target_cpu for the cpumask_test_cpu() operation below. */
if (target_cpu >= nr_cpumask_bits)
return -EINVAL;
@ -1630,22 +1630,17 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel,
if (!cpumask_test_cpu(target_cpu, housekeeping_cpumask(HK_TYPE_MANAGED_IRQ)))
return -EINVAL;
/* No CPUs should come up or down during this. */
cpus_read_lock();
if (!cpu_online(target_cpu)) {
cpus_read_unlock();
if (!cpu_online(target_cpu))
return -EINVAL;
}
/*
* Synchronizes target_cpu_store() and channel closure:
* Synchronizes vmbus_channel_set_cpu() and channel closure:
*
* { Initially: state = CHANNEL_OPENED }
*
* CPU1 CPU2
*
* [target_cpu_store()] [vmbus_disconnect_ring()]
* [vmbus_channel_set_cpu()] [vmbus_disconnect_ring()]
*
* LOCK channel_mutex LOCK channel_mutex
* LOAD r1 = state LOAD r2 = state
@ -1660,7 +1655,6 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel,
* Note. The host processes the channel messages "sequentially", in
* the order in which they are received on a per-partition basis.
*/
mutex_lock(&vmbus_connection.channel_mutex);
/*
* Hyper-V will ignore MODIFYCHANNEL messages for "non-open" channels;
@ -1668,17 +1662,17 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel,
*/
if (channel->state != CHANNEL_OPENED_STATE) {
ret = -EIO;
goto cpu_store_unlock;
goto end;
}
origin_cpu = channel->target_cpu;
if (target_cpu == origin_cpu)
goto cpu_store_unlock;
goto end;
if (vmbus_send_modifychannel(channel,
hv_cpu_number_to_vp_number(target_cpu))) {
ret = -EIO;
goto cpu_store_unlock;
goto end;
}
/*
@ -1708,10 +1702,26 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel,
origin_cpu, target_cpu);
}
cpu_store_unlock:
end:
return ret;
}
static ssize_t target_cpu_store(struct vmbus_channel *channel,
const char *buf, size_t count)
{
u32 target_cpu;
ssize_t ret;
if (sscanf(buf, "%uu", &target_cpu) != 1)
return -EIO;
cpus_read_lock();
mutex_lock(&vmbus_connection.channel_mutex);
ret = vmbus_channel_set_cpu(channel, target_cpu);
mutex_unlock(&vmbus_connection.channel_mutex);
cpus_read_unlock();
return ret;
return ret ?: count;
}
static VMBUS_CHAN_ATTR(cpu, 0644, target_cpu_show, target_cpu_store);
@ -2659,7 +2669,7 @@ static int __init hv_acpi_init(void)
if (!hv_is_hyperv_initialized())
return -ENODEV;
if (hv_root_partition && !hv_nested)
if (hv_root_partition() && !hv_nested)
return 0;
/*

View File

@ -130,7 +130,7 @@ static int __init hyperv_prepare_irq_remapping(void)
x86_init.hyper.msi_ext_dest_id())
return -ENODEV;
if (hv_root_partition) {
if (hv_root_partition()) {
name = "HYPERV-ROOT-IR";
ops = &hyperv_root_ir_domain_ops;
} else {
@ -151,7 +151,7 @@ static int __init hyperv_prepare_irq_remapping(void)
return -ENOMEM;
}
if (hv_root_partition)
if (hv_root_partition())
return 0; /* The rest is only relevant to guests */
/*
@ -217,7 +217,7 @@ hyperv_root_ir_compose_msi_msg(struct irq_data *irq_data, struct msi_msg *msg)
status = hv_unmap_ioapic_interrupt(ioapic_id, &entry);
if (status != HV_STATUS_SUCCESS)
pr_debug("%s: unexpected unmap status %lld\n", __func__, status);
hv_status_debug(status, "failed to unmap\n");
data->entry.ioapic_rte.as_uint64 = 0;
data->entry.source = 0; /* Invalid source */
@ -228,7 +228,7 @@ hyperv_root_ir_compose_msi_msg(struct irq_data *irq_data, struct msi_msg *msg)
vector, &entry);
if (status != HV_STATUS_SUCCESS) {
pr_err("%s: map hypercall failed, status %lld\n", __func__, status);
hv_status_err(status, "map failed\n");
return;
}

View File

@ -28,9 +28,15 @@
#define VTPM_BASE_ADDRESS 0xfed40000
enum hv_partition_type {
HV_PARTITION_TYPE_GUEST,
HV_PARTITION_TYPE_ROOT,
};
struct ms_hyperv_info {
u32 features;
u32 priv_high;
u32 ext_features;
u32 misc_features;
u32 hints;
u32 nested_features;
@ -58,15 +64,32 @@ struct ms_hyperv_info {
};
extern struct ms_hyperv_info ms_hyperv;
extern bool hv_nested;
extern u64 hv_current_partition_id;
extern enum hv_partition_type hv_curr_partition_type;
extern void * __percpu *hyperv_pcpu_input_arg;
extern void * __percpu *hyperv_pcpu_output_arg;
extern u64 hv_do_hypercall(u64 control, void *inputaddr, void *outputaddr);
extern u64 hv_do_fast_hypercall8(u16 control, u64 input8);
u64 hv_do_hypercall(u64 control, void *inputaddr, void *outputaddr);
u64 hv_do_fast_hypercall8(u16 control, u64 input8);
u64 hv_do_fast_hypercall16(u16 control, u64 input1, u64 input2);
bool hv_isolation_type_snp(void);
bool hv_isolation_type_tdx(void);
/*
* On architectures where Hyper-V doesn't support AEOI (e.g., ARM64),
* it doesn't provide a recommendation flag and AEOI must be disabled.
*/
static inline bool hv_recommend_using_aeoi(void)
{
#ifdef HV_DEPRECATING_AEOI_RECOMMENDED
return !(ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED);
#else
return false;
#endif
}
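As a hedged illustration, a sketch of the kind of SINT-setup code that would consult this helper, modeled on the VMBus SynIC path; the union hv_synic_sint field names and the hv_get_msr()/hv_set_msr() accessors are assumed from the existing Hyper-V headers and are not part of this hunk:

static void example_setup_message_sint(void)
{
	union hv_synic_sint shared_sint;

	shared_sint.as_uint64 = hv_get_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT);
	shared_sint.vector = vmbus_interrupt;
	shared_sint.masked = false;
	/* Enable auto-EOI only when the hypervisor does not recommend against it. */
	shared_sint.auto_eoi = hv_recommend_using_aeoi();
	hv_set_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT, shared_sint.as_uint64);
}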
static inline struct hv_proximity_domain_info hv_numa_node_to_pxm_info(int node)
{
struct hv_proximity_domain_info pxm_info = {};
@ -185,12 +208,11 @@ void hv_setup_kexec_handler(void (*handler)(void));
void hv_remove_kexec_handler(void);
void hv_setup_crash_handler(void (*handler)(struct pt_regs *regs));
void hv_remove_crash_handler(void);
void hv_setup_mshv_handler(void (*handler)(void));
extern int vmbus_interrupt;
extern int vmbus_irq;
extern bool hv_root_partition;
#if IS_ENABLED(CONFIG_HYPERV)
/*
* Hypervisor's notion of virtual processor ID is different from
@ -207,10 +229,12 @@ extern u64 (*hv_read_reference_counter)(void);
#define VP_INVAL U32_MAX
int __init hv_common_init(void);
void __init hv_get_partition_id(void);
void __init hv_common_free(void);
void __init ms_hyperv_late_init(void);
int hv_common_cpu_init(unsigned int cpu);
int hv_common_cpu_die(unsigned int cpu);
void hv_identify_partition_type(void);
void *hv_alloc_hyperv_page(void);
void *hv_alloc_hyperv_zeroed_page(void);
@ -291,6 +315,20 @@ static inline int cpumask_to_vpset_skip(struct hv_vpset *vpset,
return __cpumask_to_vpset(vpset, cpus, func);
}
#define _hv_status_fmt(fmt) "%s: Hyper-V status: %#x = %s: " fmt
#define hv_status_printk(level, status, fmt, ...) \
do { \
u64 __status = (status); \
pr_##level(_hv_status_fmt(fmt), __func__, hv_result(__status), \
hv_result_to_string(__status), ##__VA_ARGS__); \
} while (0)
#define hv_status_err(status, fmt, ...) \
hv_status_printk(err, status, fmt, ##__VA_ARGS__)
#define hv_status_debug(status, fmt, ...) \
hv_status_printk(debug, status, fmt, ##__VA_ARGS__)
const char *hv_result_to_string(u64 hv_status);
int hv_result_to_errno(u64 status);
void hyperv_report_panic(struct pt_regs *regs, long err, bool in_die);
bool hv_is_hyperv_initialized(void);
bool hv_is_hibernation_supported(void);
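A sketch of how a hypercall call site can combine the new status-logging macros with hv_result_to_errno(); the wrapper function here is illustrative, but the per-cpu input page and hv_do_hypercall() pattern follow existing usage:

static int example_get_partition_id(u64 *id)
{
	struct hv_output_get_partition_id *output;
	unsigned long flags;
	u64 status;

	local_irq_save(flags);
	/* The per-cpu hypercall input page doubles as the output page here. */
	output = *this_cpu_ptr(hyperv_pcpu_input_arg);
	status = hv_do_hypercall(HVCALL_GET_PARTITION_ID, NULL, output);
	if (hv_result_success(status))
		*id = output->partition_id;
	local_irq_restore(flags);

	if (!hv_result_success(status)) {
		/* Logs the symbolic HV_STATUS_* name via hv_result_to_string() */
		hv_status_err(status, "get partition id failed\n");
		return hv_result_to_errno(status);
	}

	return 0;
}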
@ -303,6 +341,7 @@ void hyperv_cleanup(void);
bool hv_query_ext_cap(u64 cap_query);
void hv_setup_dma_ops(struct device *dev, bool coherent);
#else /* CONFIG_HYPERV */
static inline void hv_identify_partition_type(void) {}
static inline bool hv_is_hyperv_initialized(void) { return false; }
static inline bool hv_is_hibernation_supported(void) { return false; }
static inline void hyperv_cleanup(void) {}
@ -314,4 +353,29 @@ static inline enum hv_isolation_type hv_get_isolation_type(void)
}
#endif /* CONFIG_HYPERV */
#if IS_ENABLED(CONFIG_MSHV_ROOT)
static inline bool hv_root_partition(void)
{
return hv_curr_partition_type == HV_PARTITION_TYPE_ROOT;
}
int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages);
int hv_call_add_logical_proc(int node, u32 lp_index, u32 acpi_id);
int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags);
#else /* CONFIG_MSHV_ROOT */
static inline bool hv_root_partition(void) { return false; }
static inline int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages)
{
return -EOPNOTSUPP;
}
static inline int hv_call_add_logical_proc(int node, u32 lp_index, u32 acpi_id)
{
return -EOPNOTSUPP;
}
static inline int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags)
{
return -EOPNOTSUPP;
}
#endif /* CONFIG_MSHV_ROOT */
#endif

View File

@ -13,7 +13,7 @@ struct hv_u128 {
u64 high_part;
} __packed;
/* NOTE: when adding below, update hv_status_to_string() */
/* NOTE: when adding below, update hv_result_to_string() */
#define HV_STATUS_SUCCESS 0x0
#define HV_STATUS_INVALID_HYPERCALL_CODE 0x2
#define HV_STATUS_INVALID_HYPERCALL_INPUT 0x3
@ -51,6 +51,7 @@ struct hv_u128 {
#define HV_HYP_PAGE_SHIFT 12
#define HV_HYP_PAGE_SIZE BIT(HV_HYP_PAGE_SHIFT)
#define HV_HYP_PAGE_MASK (~(HV_HYP_PAGE_SIZE - 1))
#define HV_HYP_LARGE_PAGE_SHIFT 21
#define HV_PARTITION_ID_INVALID ((u64)0)
#define HV_PARTITION_ID_SELF ((u64)-1)
@ -182,7 +183,7 @@ struct hv_tsc_emulation_control { /* HV_TSC_INVARIANT_CONTROL */
#endif /* CONFIG_X86 */
struct hv_get_partition_id { /* HV_OUTPUT_GET_PARTITION_ID */
struct hv_output_get_partition_id {
u64 partition_id;
} __packed;
@ -204,7 +205,14 @@ union hv_reference_tsc_msr {
/* The number of vCPUs in one sparse bank */
#define HV_VCPUS_PER_SPARSE_BANK (64)
/* Some of Hyper-V structs do not use hv_vpset where linux uses them */
/*
* Some Hyper-V structs do not use hv_vpset where Linux uses them.
*
* struct hv_vpset is usually used as part of hypercall input. The portion
* that counts as "fixed size input header" vs. "variable size input header"
* varies per hypercall. See comments at relevant hypercall call sites as to
* how the "valid_bank_mask" field should be accounted.
*/
struct hv_vpset { /* HV_VP_SET */
u64 format;
u64 valid_bank_mask;
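As a schematic example of the accounting described above (not a specific hypercall), assuming the hypervisor counts everything up to and including valid_bank_mask as fixed-size header:

static u64 example_vpset_rep_hypercall(u16 code, void *input,
				       u16 rep_count, u16 nr_banks)
{
	/*
	 * Only the sparse bank_contents[] entries are variable-size input;
	 * hv_do_rep_hypercall() takes that size in 8-byte units.
	 */
	u16 varhead_size = nr_banks;

	return hv_do_rep_hypercall(code, rep_count, varhead_size, input, NULL);
}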
@ -374,6 +382,10 @@ union hv_hypervisor_version_info {
#define HV_SHARED_GPA_BOUNDARY_ACTIVE BIT(5)
#define HV_SHARED_GPA_BOUNDARY_BITS GENMASK(11, 6)
/* HYPERV_CPUID_FEATURES.ECX bits. */
#define HV_VP_DISPATCH_INTERRUPT_INJECTION_AVAILABLE BIT(9)
#define HV_VP_GHCB_ROOT_MAPPING_AVAILABLE BIT(10)
enum hv_isolation_type {
HV_ISOLATION_TYPE_NONE = 0, /* HV_PARTITION_ISOLATION_TYPE_NONE */
HV_ISOLATION_TYPE_VBS = 1,
@ -436,10 +448,13 @@ union hv_vp_assist_msr_contents { /* HV_REGISTER_VP_ASSIST_PAGE */
#define HVCALL_WITHDRAW_MEMORY 0x0049
#define HVCALL_MAP_GPA_PAGES 0x004b
#define HVCALL_UNMAP_GPA_PAGES 0x004c
#define HVCALL_INSTALL_INTERCEPT 0x004d
#define HVCALL_CREATE_VP 0x004e
#define HVCALL_DELETE_VP 0x004f
#define HVCALL_GET_VP_REGISTERS 0x0050
#define HVCALL_SET_VP_REGISTERS 0x0051
#define HVCALL_TRANSLATE_VIRTUAL_ADDRESS 0x0052
#define HVCALL_CLEAR_VIRTUAL_INTERRUPT 0x0056
#define HVCALL_DELETE_PORT 0x0058
#define HVCALL_DISCONNECT_PORT 0x005b
#define HVCALL_POST_MESSAGE 0x005c
@ -447,12 +462,15 @@ union hv_vp_assist_msr_contents { /* HV_REGISTER_VP_ASSIST_PAGE */
#define HVCALL_POST_DEBUG_DATA 0x0069
#define HVCALL_RETRIEVE_DEBUG_DATA 0x006a
#define HVCALL_RESET_DEBUG_SESSION 0x006b
#define HVCALL_MAP_STATS_PAGE 0x006c
#define HVCALL_UNMAP_STATS_PAGE 0x006d
#define HVCALL_ADD_LOGICAL_PROCESSOR 0x0076
#define HVCALL_GET_SYSTEM_PROPERTY 0x007b
#define HVCALL_MAP_DEVICE_INTERRUPT 0x007c
#define HVCALL_UNMAP_DEVICE_INTERRUPT 0x007d
#define HVCALL_RETARGET_INTERRUPT 0x007e
#define HVCALL_NOTIFY_PORT_RING_EMPTY 0x008b
#define HVCALL_REGISTER_INTERCEPT_RESULT 0x0091
#define HVCALL_ASSERT_VIRTUAL_INTERRUPT 0x0094
#define HVCALL_CREATE_PORT 0x0095
#define HVCALL_CONNECT_PORT 0x0096
@ -460,12 +478,18 @@ union hv_vp_assist_msr_contents { /* HV_REGISTER_VP_ASSIST_PAGE */
#define HVCALL_GET_VP_ID_FROM_APIC_ID 0x009a
#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af
#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST 0x00b0
#define HVCALL_SIGNAL_EVENT_DIRECT 0x00c0
#define HVCALL_POST_MESSAGE_DIRECT 0x00c1
#define HVCALL_DISPATCH_VP 0x00c2
#define HVCALL_GET_GPA_PAGES_ACCESS_STATES 0x00c9
#define HVCALL_ACQUIRE_SPARSE_SPA_PAGE_HOST_ACCESS 0x00d7
#define HVCALL_RELEASE_SPARSE_SPA_PAGE_HOST_ACCESS 0x00d8
#define HVCALL_MODIFY_SPARSE_GPA_PAGE_HOST_VISIBILITY 0x00db
#define HVCALL_MAP_VP_STATE_PAGE 0x00e1
#define HVCALL_UNMAP_VP_STATE_PAGE 0x00e2
#define HVCALL_GET_VP_STATE 0x00e3
#define HVCALL_SET_VP_STATE 0x00e4
#define HVCALL_GET_VP_CPUID_VALUES 0x00f4
#define HVCALL_MMIO_READ 0x0106
#define HVCALL_MMIO_WRITE 0x0107
@ -775,10 +799,10 @@ struct hv_message_page {
/* Define timer message payload structure. */
struct hv_timer_message_payload {
__u32 timer_index;
__u32 reserved;
__u64 expiration_time; /* When the timer expired */
__u64 delivery_time; /* When the message was delivered */
u32 timer_index;
u32 reserved;
u64 expiration_time; /* When the timer expired */
u64 delivery_time; /* When the message was delivered */
} __packed;
struct hv_x64_segment_register {
@ -807,6 +831,8 @@ struct hv_x64_table_register {
u64 base;
} __packed;
#define HV_NORMAL_VTL 0
union hv_input_vtl {
u8 as_uint8;
struct {
@ -1325,6 +1351,49 @@ struct hv_retarget_device_interrupt { /* HV_INPUT_RETARGET_DEVICE_INTERRUPT */
struct hv_device_interrupt_target int_target;
} __packed __aligned(8);
enum hv_intercept_type {
#if defined(CONFIG_X86)
HV_INTERCEPT_TYPE_X64_IO_PORT = 0x00000000,
HV_INTERCEPT_TYPE_X64_MSR = 0x00000001,
HV_INTERCEPT_TYPE_X64_CPUID = 0x00000002,
#endif
HV_INTERCEPT_TYPE_EXCEPTION = 0x00000003,
/* Used to be HV_INTERCEPT_TYPE_REGISTER */
HV_INTERCEPT_TYPE_RESERVED0 = 0x00000004,
HV_INTERCEPT_TYPE_MMIO = 0x00000005,
#if defined(CONFIG_X86)
HV_INTERCEPT_TYPE_X64_GLOBAL_CPUID = 0x00000006,
HV_INTERCEPT_TYPE_X64_APIC_SMI = 0x00000007,
#endif
HV_INTERCEPT_TYPE_HYPERCALL = 0x00000008,
#if defined(CONFIG_X86)
HV_INTERCEPT_TYPE_X64_APIC_INIT_SIPI = 0x00000009,
HV_INTERCEPT_MC_UPDATE_PATCH_LEVEL_MSR_READ = 0x0000000A,
HV_INTERCEPT_TYPE_X64_APIC_WRITE = 0x0000000B,
HV_INTERCEPT_TYPE_X64_MSR_INDEX = 0x0000000C,
#endif
HV_INTERCEPT_TYPE_MAX,
HV_INTERCEPT_TYPE_INVALID = 0xFFFFFFFF,
};
union hv_intercept_parameters {
/* HV_INTERCEPT_PARAMETERS is defined to be an 8-byte field. */
u64 as_uint64;
#if defined(CONFIG_X86)
/* HV_INTERCEPT_TYPE_X64_IO_PORT */
u16 io_port;
/* HV_INTERCEPT_TYPE_X64_CPUID */
u32 cpuid_index;
/* HV_INTERCEPT_TYPE_X64_APIC_WRITE */
u32 apic_write_mask;
/* HV_INTERCEPT_TYPE_EXCEPTION */
u16 exception_vector;
/* HV_INTERCEPT_TYPE_X64_MSR_INDEX */
u32 msr_index;
#endif
/* N.B. Other intercept types do not have any parameters. */
};
/* Data structures for HVCALL_MMIO_READ and HVCALL_MMIO_WRITE */
#define HV_HYPERCALL_MMIO_MAX_DATA_LENGTH 64

View File

@ -19,11 +19,24 @@
#define HV_VP_REGISTER_PAGE_VERSION_1 1u
#define HV_VP_REGISTER_PAGE_MAX_VECTOR_COUNT 7
union hv_vp_register_page_interrupt_vectors {
u64 as_uint64;
struct {
u8 vector_count;
u8 vector[HV_VP_REGISTER_PAGE_MAX_VECTOR_COUNT];
} __packed;
};
struct hv_vp_register_page {
u16 version;
u8 isvalid;
u8 rsvdz;
u32 dirty;
#if IS_ENABLED(CONFIG_X86)
union {
struct {
/* General purpose registers
@ -95,6 +108,22 @@ struct hv_vp_register_page {
union hv_x64_pending_interruption_register pending_interruption;
union hv_x64_interrupt_state_register interrupt_state;
u64 instruction_emulation_hints;
u64 xfem;
/*
* Fields from this point are not included in the register page save chunk.
* The reserved field is intended to maintain alignment for unsaved fields.
*/
u8 reserved1[0x100];
/*
* Interrupts injected as part of HvCallDispatchVp.
*/
union hv_vp_register_page_interrupt_vectors interrupt_vectors;
#elif IS_ENABLED(CONFIG_ARM64)
/* Not yet supported in ARM */
#endif
} __packed;
#define HV_PARTITION_PROCESSOR_FEATURES_BANKS 2
@ -299,10 +328,11 @@ union hv_partition_isolation_properties {
#define HV_PARTITION_ISOLATION_HOST_TYPE_RESERVED 0x2
/* Note: Exo partition is enabled by default */
#define HV_PARTITION_CREATION_FLAG_EXO_PARTITION BIT(8)
#define HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED BIT(13)
#define HV_PARTITION_CREATION_FLAG_INTERCEPT_MESSAGE_PAGE_ENABLED BIT(19)
#define HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE BIT(22)
#define HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED BIT(4)
#define HV_PARTITION_CREATION_FLAG_EXO_PARTITION BIT(8)
#define HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED BIT(13)
#define HV_PARTITION_CREATION_FLAG_INTERCEPT_MESSAGE_PAGE_ENABLED BIT(19)
#define HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE BIT(22)
struct hv_input_create_partition {
u64 flags;
@ -349,13 +379,23 @@ struct hv_input_set_partition_property {
enum hv_vp_state_page_type {
HV_VP_STATE_PAGE_REGISTERS = 0,
HV_VP_STATE_PAGE_INTERCEPT_MESSAGE = 1,
HV_VP_STATE_PAGE_GHCB = 2,
HV_VP_STATE_PAGE_COUNT
};
struct hv_input_map_vp_state_page {
u64 partition_id;
u32 vp_index;
u32 type; /* enum hv_vp_state_page_type */
u16 type; /* enum hv_vp_state_page_type */
union hv_input_vtl input_vtl;
union {
u8 as_uint8;
struct {
u8 map_location_provided : 1;
u8 reserved : 7;
};
} flags;
u64 requested_map_location;
} __packed;
struct hv_output_map_vp_state_page {
@ -365,7 +405,14 @@ struct hv_output_map_vp_state_page {
struct hv_input_unmap_vp_state_page {
u64 partition_id;
u32 vp_index;
u32 type; /* enum hv_vp_state_page_type */
u16 type; /* enum hv_vp_state_page_type */
union hv_input_vtl input_vtl;
u8 reserved0;
} __packed;
struct hv_x64_apic_eoi_message {
u32 vp_index;
u32 interrupt_vector;
} __packed;
struct hv_opaque_intercept_message {
@ -515,6 +562,13 @@ struct hv_synthetic_timers_state {
u64 reserved[5];
} __packed;
struct hv_async_completion_message_payload {
u64 partition_id;
u32 status;
u32 completion_count;
u64 sub_status;
} __packed;
union hv_input_delete_vp {
u64 as_uint64[2];
struct {
@ -649,6 +703,57 @@ struct hv_input_set_vp_state {
union hv_input_set_vp_state_data data[];
} __packed;
union hv_x64_vp_execution_state {
u16 as_uint16;
struct {
u16 cpl:2;
u16 cr0_pe:1;
u16 cr0_am:1;
u16 efer_lma:1;
u16 debug_active:1;
u16 interruption_pending:1;
u16 vtl:4;
u16 enclave_mode:1;
u16 interrupt_shadow:1;
u16 virtualization_fault_active:1;
u16 reserved:2;
} __packed;
};
struct hv_x64_intercept_message_header {
u32 vp_index;
u8 instruction_length:4;
u8 cr8:4; /* Only set for exo partitions */
u8 intercept_access_type;
union hv_x64_vp_execution_state execution_state;
struct hv_x64_segment_register cs_segment;
u64 rip;
u64 rflags;
} __packed;
union hv_x64_memory_access_info {
u8 as_uint8;
struct {
u8 gva_valid:1;
u8 gva_gpa_valid:1;
u8 hypercall_output_pending:1;
u8 tlb_locked_no_overlay:1;
u8 reserved:4;
} __packed;
};
struct hv_x64_memory_intercept_message {
struct hv_x64_intercept_message_header header;
u32 cache_type; /* enum hv_cache_type */
u8 instruction_byte_count;
union hv_x64_memory_access_info memory_access_info;
u8 tpr_priority;
u8 reserved1;
u64 guest_virtual_address;
u64 guest_physical_address;
u8 instruction_bytes[16];
} __packed;
/*
* Dispatch state for the VP communicated by the hypervisor to the
* VP-dispatching thread in the root on return from HVCALL_DISPATCH_VP.
@ -716,6 +821,7 @@ static_assert(sizeof(struct hv_vp_signal_pair_scheduler_message) ==
#define HV_DISPATCH_VP_FLAG_SKIP_VP_SPEC_FLUSH 0x8
#define HV_DISPATCH_VP_FLAG_SKIP_CALLER_SPEC_FLUSH 0x10
#define HV_DISPATCH_VP_FLAG_SKIP_CALLER_USER_SPEC_FLUSH 0x20
#define HV_DISPATCH_VP_FLAG_SCAN_INTERRUPT_INJECTION 0x40
struct hv_input_dispatch_vp {
u64 partition_id;
@ -730,4 +836,18 @@ struct hv_output_dispatch_vp {
u32 dispatch_event; /* enum hv_vp_dispatch_event */
} __packed;
struct hv_input_modify_sparse_spa_page_host_access {
u32 host_access : 2;
u32 reserved : 30;
u32 flags;
u64 partition_id;
u64 spa_page_list[];
} __packed;
/* hv_input_modify_sparse_spa_page_host_access flags */
#define HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_EXCLUSIVE 0x1
#define HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_SHARED 0x2
#define HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE 0x4
#define HV_MODIFY_SPA_PAGE_HOST_ACCESS_HUGE_PAGE 0x8
#endif /* _HV_HVHDK_H */

View File

@ -36,6 +36,52 @@ enum hv_scheduler_type {
HV_SCHEDULER_TYPE_MAX
};
/* HV_STATS_AREA_TYPE */
enum hv_stats_area_type {
HV_STATS_AREA_SELF = 0,
HV_STATS_AREA_PARENT = 1,
HV_STATS_AREA_INTERNAL = 2,
HV_STATS_AREA_COUNT
};
enum hv_stats_object_type {
HV_STATS_OBJECT_HYPERVISOR = 0x00000001,
HV_STATS_OBJECT_LOGICAL_PROCESSOR = 0x00000002,
HV_STATS_OBJECT_PARTITION = 0x00010001,
HV_STATS_OBJECT_VP = 0x00010002
};
union hv_stats_object_identity {
/* hv_stats_hypervisor */
struct {
u8 reserved[15];
u8 stats_area_type;
} __packed hv;
/* hv_stats_logical_processor */
struct {
u32 lp_index;
u8 reserved[11];
u8 stats_area_type;
} __packed lp;
/* hv_stats_partition */
struct {
u64 partition_id;
u8 reserved[7];
u8 stats_area_type;
} __packed partition;
/* hv_stats_vp */
struct {
u64 partition_id;
u32 vp_index;
u16 flags;
u8 reserved;
u8 stats_area_type;
} __packed vp;
};
enum hv_partition_property_code {
/* Privilege properties */
HV_PARTITION_PROPERTY_PRIVILEGE_FLAGS = 0x00010000,
@ -47,19 +93,45 @@ enum hv_partition_property_code {
/* Compatibility properties */
HV_PARTITION_PROPERTY_PROCESSOR_XSAVE_FEATURES = 0x00060002,
HV_PARTITION_PROPERTY_XSAVE_STATES = 0x00060007,
HV_PARTITION_PROPERTY_MAX_XSAVE_DATA_SIZE = 0x00060008,
HV_PARTITION_PROPERTY_PROCESSOR_CLOCK_FREQUENCY = 0x00060009,
};
enum hv_snp_status {
HV_SNP_STATUS_NONE = 0,
HV_SNP_STATUS_AVAILABLE = 1,
HV_SNP_STATUS_INCOMPATIBLE = 2,
HV_SNP_STATUS_PSP_UNAVAILABLE = 3,
HV_SNP_STATUS_PSP_INIT_FAILED = 4,
HV_SNP_STATUS_PSP_BAD_FW_VERSION = 5,
HV_SNP_STATUS_BAD_CONFIGURATION = 6,
HV_SNP_STATUS_PSP_FW_UPDATE_IN_PROGRESS = 7,
HV_SNP_STATUS_PSP_RB_INIT_FAILED = 8,
HV_SNP_STATUS_PSP_PLATFORM_STATUS_FAILED = 9,
HV_SNP_STATUS_PSP_INIT_LATE_FAILED = 10,
};
enum hv_system_property {
/* Add more values when needed */
HV_SYSTEM_PROPERTY_SCHEDULER_TYPE = 15,
HV_DYNAMIC_PROCESSOR_FEATURE_PROPERTY = 21,
};
enum hv_dynamic_processor_feature_property {
/* Add more values when needed */
HV_X64_DYNAMIC_PROCESSOR_FEATURE_MAX_ENCRYPTED_PARTITIONS = 13,
HV_X64_DYNAMIC_PROCESSOR_FEATURE_SNP_STATUS = 16,
};
struct hv_input_get_system_property {
u32 property_id; /* enum hv_system_property */
union {
u32 as_uint32;
#if IS_ENABLED(CONFIG_X86)
/* enum hv_dynamic_processor_feature_property */
u32 hv_processor_feature;
#endif
/* More fields to be filled in when needed */
};
} __packed;
@ -67,9 +139,28 @@ struct hv_input_get_system_property {
struct hv_output_get_system_property {
union {
u32 scheduler_type; /* enum hv_scheduler_type */
#if IS_ENABLED(CONFIG_X86)
u64 hv_processor_feature_value;
#endif
};
} __packed;
struct hv_input_map_stats_page {
u32 type; /* enum hv_stats_object_type */
u32 padding;
union hv_stats_object_identity identity;
} __packed;
struct hv_output_map_stats_page {
u64 map_location;
} __packed;
struct hv_input_unmap_stats_page {
u32 type; /* enum hv_stats_object_type */
u32 padding;
union hv_stats_object_identity identity;
} __packed;
struct hv_proximity_domain_flags {
u32 proximity_preferred : 1;
u32 reserved : 30;

View File

@ -371,19 +371,6 @@ struct vmtransfer_page_packet_header {
struct vmtransfer_page_range ranges[];
} __packed;
struct vmgpadl_packet_header {
struct vmpacket_descriptor d;
u32 gpadl;
u32 reserved;
} __packed;
struct vmadd_remove_transfer_page_set {
struct vmpacket_descriptor d;
u32 gpadl;
u16 xfer_pageset_id;
u16 reserved;
} __packed;
/*
* This structure defines a range in guest physical space that can be made to
* look virtually contiguous.
@ -394,30 +381,6 @@ struct gpa_range {
u64 pfn_array[];
};
/*
* This is the format for an Establish Gpadl packet, which contains a handle by
* which this GPADL will be known and a set of GPA ranges associated with it.
* This can be converted to a MDL by the guest OS. If there are multiple GPA
* ranges, then the resulting MDL will be "chained," representing multiple VA
* ranges.
*/
struct vmestablish_gpadl {
struct vmpacket_descriptor d;
u32 gpadl;
u32 range_cnt;
struct gpa_range range[1];
} __packed;
/*
* This is the format for a Teardown Gpadl packet, which indicates that the
* GPADL handle in the Establish Gpadl packet will never be referenced again.
*/
struct vmteardown_gpadl {
struct vmpacket_descriptor d;
u32 gpadl;
u32 reserved; /* for alignment to a 8-byte boundary */
} __packed;
/*
* This is the format for a GPA-Direct packet, which contains a set of GPA
* ranges, in addition to commands and/or data.
@ -429,25 +392,6 @@ struct vmdata_gpa_direct {
struct gpa_range range[1];
} __packed;
/* This is the format for a Additional Data Packet. */
struct vmadditional_data {
struct vmpacket_descriptor d;
u64 total_bytes;
u32 offset;
u32 byte_cnt;
unsigned char data[1];
} __packed;
union vmpacket_largest_possible_header {
struct vmpacket_descriptor simple_hdr;
struct vmtransfer_page_packet_header xfer_page_hdr;
struct vmgpadl_packet_header gpadl_hdr;
struct vmadd_remove_transfer_page_set add_rm_xfer_page_hdr;
struct vmestablish_gpadl establish_gpadl_hdr;
struct vmteardown_gpadl teardown_gpadl_hdr;
struct vmdata_gpa_direct data_gpa_direct_hdr;
};
#define VMPACKET_DATA_START_ADDRESS(__packet) \
(void *)(((unsigned char *)__packet) + \
((struct vmpacket_descriptor)__packet)->offset8 * 8)
@ -1661,6 +1605,7 @@ int vmbus_send_tl_connect_request(const guid_t *shv_guest_servie_id,
const guid_t *shv_host_servie_id);
int vmbus_send_modifychannel(struct vmbus_channel *channel, u32 target_vp);
void vmbus_set_event(struct vmbus_channel *channel);
int vmbus_channel_set_cpu(struct vmbus_channel *channel, u32 target_cpu);
/* Get the start of the ring buffer. */
static inline void *

include/uapi/linux/mshv.h (new file, 291 lines)
View File

@ -0,0 +1,291 @@
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
/*
* Userspace interfaces for /dev/mshv* devices and derived fds
*
* This file is divided into sections containing data structures and IOCTLs for
* a particular set of related devices or derived file descriptors.
*
* The IOCTL definitions are at the end of each section. They are grouped by
* device/fd, so that new IOCTLs can easily be added with a monotonically
* increasing number.
*/
#ifndef _UAPI_LINUX_MSHV_H
#define _UAPI_LINUX_MSHV_H
#include <linux/types.h>
#define MSHV_IOCTL 0xB8
/*
*******************************************
* Entry point to main VMM APIs: /dev/mshv *
*******************************************
*/
enum {
MSHV_PT_BIT_LAPIC,
MSHV_PT_BIT_X2APIC,
MSHV_PT_BIT_GPA_SUPER_PAGES,
MSHV_PT_BIT_COUNT,
};
#define MSHV_PT_FLAGS_MASK ((1 << MSHV_PT_BIT_COUNT) - 1)
enum {
MSHV_PT_ISOLATION_NONE,
MSHV_PT_ISOLATION_COUNT,
};
/**
* struct mshv_create_partition - arguments for MSHV_CREATE_PARTITION
* @pt_flags: Bitmask of 1 << MSHV_PT_BIT_*
* @pt_isolation: MSHV_PT_ISOLATION_*
*
* Returns a file descriptor to act as a handle to a guest partition.
* At this point the partition is not yet initialized in the hypervisor.
* Some operations must be done with the partition in this state, e.g. setting
* so-called "early" partition properties. The partition can then be
* initialized with MSHV_INITIALIZE_PARTITION.
*/
struct mshv_create_partition {
__u64 pt_flags;
__u64 pt_isolation;
};
/* /dev/mshv */
#define MSHV_CREATE_PARTITION _IOW(MSHV_IOCTL, 0x00, struct mshv_create_partition)
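A minimal userspace sketch of the flow described above (create the partition, optionally set "early" properties, then initialize); the flag choice is only an example and error handling is abbreviated:

#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/mshv.h>

int create_guest_partition(void)
{
	struct mshv_create_partition args = {
		.pt_flags     = (1ULL << MSHV_PT_BIT_LAPIC) |
				(1ULL << MSHV_PT_BIT_X2APIC),
		.pt_isolation = MSHV_PT_ISOLATION_NONE,
	};
	int mshv_fd, pt_fd;

	mshv_fd = open("/dev/mshv", O_RDWR | O_CLOEXEC);
	if (mshv_fd < 0)
		return -1;

	/* Returns an fd that acts as the handle to the new partition. */
	pt_fd = ioctl(mshv_fd, MSHV_CREATE_PARTITION, &args);
	if (pt_fd < 0)
		return -1;

	/* "Early" partition properties would be set here. */

	/* MSHV_INITIALIZE_PARTITION is defined further below. */
	if (ioctl(pt_fd, MSHV_INITIALIZE_PARTITION) < 0)
		return -1;

	return pt_fd;
}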
/*
************************
* Child partition APIs *
************************
*/
struct mshv_create_vp {
__u32 vp_index;
};
enum {
MSHV_SET_MEM_BIT_WRITABLE,
MSHV_SET_MEM_BIT_EXECUTABLE,
MSHV_SET_MEM_BIT_UNMAP,
MSHV_SET_MEM_BIT_COUNT
};
#define MSHV_SET_MEM_FLAGS_MASK ((1 << MSHV_SET_MEM_BIT_COUNT) - 1)
/* The hypervisor's "native" page size */
#define MSHV_HV_PAGE_SIZE 0x1000
/**
* struct mshv_user_mem_region - arguments for MSHV_SET_GUEST_MEMORY
* @size: Size of the memory region (bytes). Must be aligned to
* MSHV_HV_PAGE_SIZE
* @guest_pfn: Base guest page number to map
* @userspace_addr: Base address of userspace memory. Must be aligned to
* MSHV_HV_PAGE_SIZE
* @flags: Bitmask of 1 << MSHV_SET_MEM_BIT_*. If (1 << MSHV_SET_MEM_BIT_UNMAP)
* is set, ignore other bits.
* @rsvd: MBZ
*
* Map or unmap a region of userspace memory to Guest Physical Addresses (GPA).
* Mappings can't overlap in GPA space or userspace.
* To unmap, these fields must match an existing mapping.
*/
struct mshv_user_mem_region {
__u64 size;
__u64 guest_pfn;
__u64 userspace_addr;
__u8 flags;
__u8 rsvd[7];
};
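A hedged userspace sketch of mapping guest RAM with this structure; the guest physical base address is arbitrary:

#include <stdint.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/mshv.h>

static int map_guest_ram(int pt_fd, __u64 guest_base, size_t size)
{
	void *mem = mmap(NULL, size, PROT_READ | PROT_WRITE,
			 MAP_SHARED | MAP_ANONYMOUS, -1, 0);
	struct mshv_user_mem_region region = {
		.size           = size,	/* multiple of MSHV_HV_PAGE_SIZE */
		.guest_pfn      = guest_base / MSHV_HV_PAGE_SIZE,
		.userspace_addr = (__u64)(uintptr_t)mem,
		.flags          = (1 << MSHV_SET_MEM_BIT_WRITABLE) |
				  (1 << MSHV_SET_MEM_BIT_EXECUTABLE),
	};

	if (mem == MAP_FAILED)
		return -1;

	return ioctl(pt_fd, MSHV_SET_GUEST_MEMORY, &region);
}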
enum {
MSHV_IRQFD_BIT_DEASSIGN,
MSHV_IRQFD_BIT_RESAMPLE,
MSHV_IRQFD_BIT_COUNT,
};
#define MSHV_IRQFD_FLAGS_MASK ((1 << MSHV_IRQFD_BIT_COUNT) - 1)
struct mshv_user_irqfd {
__s32 fd;
__s32 resamplefd;
__u32 gsi;
__u32 flags;
};
enum {
MSHV_IOEVENTFD_BIT_DATAMATCH,
MSHV_IOEVENTFD_BIT_PIO,
MSHV_IOEVENTFD_BIT_DEASSIGN,
MSHV_IOEVENTFD_BIT_COUNT,
};
#define MSHV_IOEVENTFD_FLAGS_MASK ((1 << MSHV_IOEVENTFD_BIT_COUNT) - 1)
struct mshv_user_ioeventfd {
__u64 datamatch;
__u64 addr; /* legal pio/mmio address */
__u32 len; /* 1, 2, 4, or 8 bytes */
__s32 fd;
__u32 flags;
__u8 rsvd[4];
};
struct mshv_user_irq_entry {
__u32 gsi;
__u32 address_lo;
__u32 address_hi;
__u32 data;
};
struct mshv_user_irq_table {
__u32 nr;
__u32 rsvd; /* MBZ */
struct mshv_user_irq_entry entries[];
};
enum {
MSHV_GPAP_ACCESS_TYPE_ACCESSED,
MSHV_GPAP_ACCESS_TYPE_DIRTY,
MSHV_GPAP_ACCESS_TYPE_COUNT /* Count of enum members */
};
enum {
MSHV_GPAP_ACCESS_OP_NOOP,
MSHV_GPAP_ACCESS_OP_CLEAR,
MSHV_GPAP_ACCESS_OP_SET,
MSHV_GPAP_ACCESS_OP_COUNT /* Count of enum members */
};
/**
* struct mshv_gpap_access_bitmap - arguments for MSHV_GET_GPAP_ACCESS_BITMAP
* @access_type: MSHV_GPAP_ACCESS_TYPE_* - The type of access to record in the
* bitmap
* @access_op: MSHV_GPAP_ACCESS_OP_* - Allows an optional clear or set of all
* the access states in the range, after retrieving the current
* states.
* @rsvd: MBZ
* @page_count: Number of pages
* @gpap_base: Base gpa page number
* @bitmap_ptr: Output buffer for bitmap, at least (page_count + 7) / 8 bytes
*
* Retrieve a bitmap of either ACCESSED or DIRTY bits for a given range of guest
* memory, and optionally clear or set the bits.
*/
struct mshv_gpap_access_bitmap {
__u8 access_type;
__u8 access_op;
__u8 rsvd[6];
__u64 page_count;
__u64 gpap_base;
__u64 bitmap_ptr;
};
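For example, a dirty-page-log query might look like the following userspace sketch, clearing the dirty bits as they are read:

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/mshv.h>

static int get_dirty_log(int pt_fd, __u64 gpap_base, __u64 page_count,
			 void *bitmap)
{
	struct mshv_gpap_access_bitmap args = {
		.access_type = MSHV_GPAP_ACCESS_TYPE_DIRTY,
		.access_op   = MSHV_GPAP_ACCESS_OP_CLEAR,	/* clear after read */
		.page_count  = page_count,
		.gpap_base   = gpap_base,
		/* bitmap must hold at least (page_count + 7) / 8 bytes */
		.bitmap_ptr  = (__u64)(uintptr_t)bitmap,
	};

	return ioctl(pt_fd, MSHV_GET_GPAP_ACCESS_BITMAP, &args);
}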
/**
* struct mshv_root_hvcall - arguments for MSHV_ROOT_HVCALL
* @code: Hypercall code (HVCALL_*)
* @reps: in: Rep count ('repcount')
* out: Reps completed ('repcomp'). MBZ unless rep hvcall
* @in_sz: Size of input incl rep data. <= MSHV_HV_PAGE_SIZE
* @out_sz: Size of output buffer. <= MSHV_HV_PAGE_SIZE. MBZ if out_ptr is 0
* @status: in: MBZ
* out: HV_STATUS_* from hypercall
* @rsvd: MBZ
* @in_ptr: Input data buffer (struct hv_input_*). If used with partition or
* vp fd, partition id field is populated by kernel.
* @out_ptr: Output data buffer (optional)
*/
struct mshv_root_hvcall {
__u16 code;
__u16 reps;
__u16 in_sz;
__u16 out_sz;
__u16 status;
__u8 rsvd[6];
__u64 in_ptr;
__u64 out_ptr;
};
/* Partition fds created with MSHV_CREATE_PARTITION */
#define MSHV_INITIALIZE_PARTITION _IO(MSHV_IOCTL, 0x00)
#define MSHV_CREATE_VP _IOW(MSHV_IOCTL, 0x01, struct mshv_create_vp)
#define MSHV_SET_GUEST_MEMORY _IOW(MSHV_IOCTL, 0x02, struct mshv_user_mem_region)
#define MSHV_IRQFD _IOW(MSHV_IOCTL, 0x03, struct mshv_user_irqfd)
#define MSHV_IOEVENTFD _IOW(MSHV_IOCTL, 0x04, struct mshv_user_ioeventfd)
#define MSHV_SET_MSI_ROUTING _IOW(MSHV_IOCTL, 0x05, struct mshv_user_irq_table)
#define MSHV_GET_GPAP_ACCESS_BITMAP _IOWR(MSHV_IOCTL, 0x06, struct mshv_gpap_access_bitmap)
/* Generic hypercall */
#define MSHV_ROOT_HVCALL _IOWR(MSHV_IOCTL, 0x07, struct mshv_root_hvcall)
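A generic userspace sketch of the pass-through described above; the caller supplies an HVCALL_* code and a prepared hv_input_* buffer (both from the hyperv headers, not this file):

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/mshv.h>

static int root_hvcall(int fd, __u16 code, void *in, __u16 in_sz)
{
	struct mshv_root_hvcall args = {
		.code   = code,		/* HVCALL_* */
		.in_sz  = in_sz,	/* <= MSHV_HV_PAGE_SIZE */
		.in_ptr = (__u64)(uintptr_t)in,
	};
	int ret = ioctl(fd, MSHV_ROOT_HVCALL, &args);

	/*
	 * On failure, args.status carries the raw HV_STATUS_* code;
	 * for partition/VP fds the kernel fills in the partition id.
	 */
	return ret;
}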
/*
********************************
* VP APIs for child partitions *
********************************
*/
#define MSHV_RUN_VP_BUF_SZ 256
/*
* VP state pages may be mapped to userspace via mmap().
* To specify which state page, use MSHV_VP_MMAP_OFFSET_ values multiplied by
* the system page size.
* e.g.
* long page_size = sysconf(_SC_PAGE_SIZE);
* void *reg_page = mmap(NULL, MSHV_HV_PAGE_SIZE, PROT_READ|PROT_WRITE,
* MAP_SHARED, vp_fd,
* MSHV_VP_MMAP_OFFSET_REGISTERS * page_size);
*/
enum {
MSHV_VP_MMAP_OFFSET_REGISTERS,
MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE,
MSHV_VP_MMAP_OFFSET_GHCB,
MSHV_VP_MMAP_OFFSET_COUNT
};
/**
* struct mshv_run_vp - argument for MSHV_RUN_VP
* @msg_buf: On success, the intercept message is copied here. It can be
* interpreted using the relevant hypervisor definitions.
*/
struct mshv_run_vp {
__u8 msg_buf[MSHV_RUN_VP_BUF_SZ];
};
enum {
MSHV_VP_STATE_LAPIC, /* Local interrupt controller state (either arch) */
MSHV_VP_STATE_XSAVE, /* XSAVE data in compacted form (x86_64) */
MSHV_VP_STATE_SIMP,
MSHV_VP_STATE_SIEFP,
MSHV_VP_STATE_SYNTHETIC_TIMERS,
MSHV_VP_STATE_COUNT,
};
/**
* struct mshv_get_set_vp_state - arguments for MSHV_[GET,SET]_VP_STATE
* @type: MSHV_VP_STATE_*
* @rsvd: MBZ
* @buf_sz: in: 4k page-aligned size of buffer
* out: Actual size of data (on EINVAL, check this to see if buffer
* was too small)
* @buf_ptr: 4k page-aligned data buffer
*/
struct mshv_get_set_vp_state {
__u8 type;
__u8 rsvd[3];
__u32 buf_sz;
__u64 buf_ptr;
};
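A userspace sketch of retrieving vCPU state with this structure (the MSHV_GET_VP_STATE ioctl is defined just below); buffer management is left to the caller:

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/mshv.h>

static int get_lapic_state(int vp_fd, void *buf, __u32 buf_sz)
{
	struct mshv_get_set_vp_state state = {
		.type    = MSHV_VP_STATE_LAPIC,
		.buf_sz  = buf_sz,			/* 4k page-aligned */
		.buf_ptr = (__u64)(uintptr_t)buf,	/* 4k page-aligned */
	};
	int ret = ioctl(vp_fd, MSHV_GET_VP_STATE, &state);

	/* On EINVAL, state.buf_sz reports the size actually required. */
	return ret;
}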
/* VP fds created with MSHV_CREATE_VP */
#define MSHV_RUN_VP _IOR(MSHV_IOCTL, 0x00, struct mshv_run_vp)
#define MSHV_GET_VP_STATE _IOWR(MSHV_IOCTL, 0x01, struct mshv_get_set_vp_state)
#define MSHV_SET_VP_STATE _IOWR(MSHV_IOCTL, 0x02, struct mshv_get_set_vp_state)
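Putting the VP ioctls together, a hedged sketch of a VMM's per-vCPU run loop; intercept decoding is left as a comment because it depends on the hv_message definitions in the hyperv headers:

#include <sys/ioctl.h>
#include <linux/mshv.h>

static void run_vp_loop(int vp_fd)
{
	struct mshv_run_vp run;

	for (;;) {
		/* Blocks while the vCPU runs; returns on an intercept. */
		if (ioctl(vp_fd, MSHV_RUN_VP, &run) < 0)
			break;

		/*
		 * run.msg_buf now holds the intercept message; a real VMM
		 * decodes it (e.g. a memory or I/O port intercept), emulates
		 * the access, and loops to resume the vCPU.
		 */
	}
}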
/*
* Generic hypercall
* Defined above in partition IOCTLs, avoid redefining it here
* #define MSHV_ROOT_HVCALL _IOWR(MSHV_IOCTL, 0x07, struct mshv_root_hvcall)
*/
#endif

View File

@ -526,6 +526,7 @@ void lockdep_assert_cpus_held(void)
percpu_rwsem_assert_held(&cpu_hotplug_lock);
}
EXPORT_SYMBOL_GPL(lockdep_assert_cpus_held);
#ifdef CONFIG_LOCKDEP
int lockdep_is_cpus_held(void)